Example #1
def distanceMatrixCorrelation(matrix1, matrix2, weights = None,
                              collectComponents = False):
    """
    :param matrix1: first matrix to compare
    :param matrix2: second matrix to compare
    :param weights: optional matrix of per-row weights (same size)
    :param collectComponents: if True, also collect per-component means
    :return: mean and STD of the Kendall tau distances between all rows,
        the list of names sorted from best to worst correlation, and the
        per-component mean list (or None)
    """

    size = matrix1.getSize()
    assert(size == matrix2.getSize())
    assert((not weights) or (size == weights.getSize()))
    kendallList = [None] * size
    weightsAllOnes = [1.0] * size
    compDict = DefDict(list)
    compSet = set()
    if collectComponents:
        for vl in matrix1.getArray():
            for v in vl:
                compSet.add(v)
    for i in range(size):
        components = DefDict(float)
        kendallList[i] = calculateWeightedKendall(matrix1[i],
            matrix2[i], weights = weights[i] if weights else None,
            components = components if collectComponents else None)
        for k in compSet:
            compDict[k].append(components[k])
    sortedNames = sorted(zip(matrix1.names, kendallList), key =
        operator.itemgetter(1))
    compList = None
    if collectComponents:
        compList = [np.mean(compDict[k]) for k in sorted(compDict)]
    return (np.mean(kendallList), np.std(kendallList), sortedNames, compList)
Example #2
File: utils.py Project: hylkedonker/avenio
def dict_to_frame(gene_counts: defaultdict, index=None) -> pd.DataFrame:
    """
    Combine all distributions in `gene_counts` into a data frame.
    """
    index_cache = set()
    gene_series = {}
    genes = sorted(gene_counts.keys())
    for gene in genes:
        s = dict_as_series(gene_counts[gene], index)
        gene_series[gene] = s
        index_cache = index_cache.union(set(s.index))

    reindex = list(index_cache)
    reindex.sort()

    if index is not None and len(index) > len(reindex):
        reindex = index

    df = pd.DataFrame(gene_series, index=reindex,
                      columns=genes).fillna(0).astype(int)

    is_digit = (isinstance(x, str) and x.isdigit() for x in df.index)

    if all(is_digit):
        reindex = list(map(int, df.index))
        df.index = reindex
        return df.sort_index()

    return df
Example #3
def frequency_search(freq: int, hist: defaultdict):
    """Return all words in `hist` whose frequency equals `freq`."""
    words = []
    for word, count in hist.items():
        if count == freq:
            words.append(word)
    return words
Example #4
def fill_board_manhattan_distance_less_than(
        board: np.array, maximum: int,
        manhattan_distance_dict: collections.defaultdict) -> np.array:
    board_copy = board.copy()
    for index, distances in manhattan_distance_dict.items():
        sum_distances = sum(map(lambda x: x['distance'], distances))
        board_copy[index] = (1 if sum_distances < maximum else 0)
    return board_copy
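A minimal usage sketch with made-up data; `manhattan_distance_dict` is assumed to map a board index to a list of {'id', 'distance'} records, which is the shape the `sum(... x['distance'] ...)` call implies.

import collections
import numpy as np

board = np.zeros((3, 3), dtype=int)
mdd = collections.defaultdict(list)
mdd[(0, 0)] = [{"id": 1, "distance": 0}, {"id": 2, "distance": 4}]
mdd[(2, 2)] = [{"id": 1, "distance": 4}, {"id": 2, "distance": 0}]

# Cells whose total distance to all points is below the maximum are marked 1
print(fill_board_manhattan_distance_less_than(board, 5, mdd))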
Example #6
def utils_str_length_bigger(st: collections.defaultdict):
    """Return the length of the longest first string among the dict's values"""
    m = 0
    for values in st.values():
        first = values[0]
        if len(first) > m:
            m = len(first)
    return m
Example #7
File: utils.py Project: mptich/shared
 def __init__(self, maxCount, mode):
     self.maxCount = maxCount
     self.mode = mode
     self.fileList = []
     self.fileDict = {}
     self.fileCache = DefDict(list)
     self.hitCount = 0
     self.xactCount = 0
Example #8
def ToCsv(
  path: pathlib.Path, vocab_counts: defaultdict, node_count: int,
):
  vocab_entries = sorted(vocab_counts.items(), key=lambda x: -x[1])
  total_count = sum(vocab_counts.values())

  cumfreq = 0
  node_cumfreq = 0
  with open(str(path), "w") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(
      ("cumulative_frequency", "cumulative_node_frequency", "count", "text",)
    )
    for text, count in vocab_entries:
      cumfreq += count / total_count
      node_cumfreq += count / node_count
      writer.writerow((cumfreq, node_cumfreq, count, text))
Example #9
def get_embeddings_per_log(data: defaultdict,
                           model: fasttext.FastText) -> np.ndarray:
    # create embeddings per log, stripping the trailing '\n' (newline) first
    embeddings = [
        model.get_sentence_vector(log.rstrip()) for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
Example #10
def count_stats(stats_data: defaultdict):
    counter = {}
    for stat, inc in stats_data.items():
        if isinstance(stat, enum.Enum):
            stat = stat.value
        counter[stat] = len(inc)

    return sorted(counter.items(), key=lambda i: i[1], reverse=True)
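A short usage sketch (hypothetical data); each value in `stats_data` is assumed to be a list of occurrences, which is what `len(inc)` implies.

from collections import defaultdict

stats_data = defaultdict(list)
stats_data["timeout"] += ["req1", "req2", "req3"]
stats_data["refused"] += ["req4"]

print(count_stats(stats_data))   # [('timeout', 3), ('refused', 1)]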
Example #11
def solution(gender_prefer: str, geo_limit: int, profile: defaultdict):
    results = []
    for name in profile.keys():
        if profile[name][0] in gender_prefer and int(profile[name][1]) <= geo_limit:
            results.append(name)
    if not results:
        return 'No one yet'
    return '\n'.join(sorted(results))
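A hedged usage sketch with hypothetical profiles; the indexing suggests each value is a (gender, distance) pair stored as strings.

from collections import defaultdict

profile = defaultdict(tuple)
profile["kim"] = ("F", "10")
profile["lee"] = ("M", "30")
profile["park"] = ("F", "50")

print(solution("F", 20, profile))   # -> kim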
Example #12
File: srl_utils.py Project: jgung/tf-nlp
 def _end(self, context: defaultdict, output_file: str) -> None:
     out_path = os.path.join(self.out_file, output_file + '.phrases.tsv')
     with open(out_path, 'w') as out:
         print('writing phrases to %s...' % out_path)
         for phrase_label, phrase in context.items():
             for span, count in phrase.items():
                 out.write('%s\t%d\t%s\n' %
                           (arg_to_a(phrase_label), count, span))
Example #13
def _freeze_nested_defaultdict(d: defaultdict) -> dict:
    d = dict(d)

    for k, v in d.items():
        if isinstance(v, defaultdict):
            d[k] = _freeze_nested_defaultdict(v)

    return d
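A small sketch (hypothetical data) of the recursive conversion:

from collections import defaultdict

nested = defaultdict(lambda: defaultdict(int))
nested["a"]["x"] += 1
nested["b"]["y"] += 2

frozen = _freeze_nested_defaultdict(nested)
print(frozen)   # {'a': {'x': 1}, 'b': {'y': 2}} -- plain dicts at every level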
Example #14
def get_average_scores(directors: defaultdict):
    """Iterate through the directors dict (returned by get_movies_by_director),
       return a list of tuples (director, average_score) ordered by highest
       score in descending order. Only take directors into account
       with >= MIN_MOVIES"""
    return sorted(
        [(d, calc_mean_score(directors[d]))
         for d in directors.keys() if len(directors[d]) >= MIN_MOVIES],
        key=lambda x: -x[1])
Example #15
def guess_symbol(frequency_map: defaultdict, frequency: float) -> str:
    s = ''
    diff = 100.0
    for symbol, freq in frequency_map.items():
        if abs(frequency - freq) < diff:
            diff = abs(frequency - freq)
            s = symbol

    return s
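A quick sketch using hypothetical, approximate English letter frequencies (in percent):

from collections import defaultdict

english_freq = defaultdict(float, {"e": 12.7, "t": 9.1, "a": 8.2, "o": 7.5})

print(guess_symbol(english_freq, 9.0))   # 't' -- the closest stored frequency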
Example #16
def scanning_error_rate(ticket_rules: defaultdict, tickets: List[List[int]]) -> int:
    """Calculates the scanning error rate: the sum of all values that are invalid for every field"""
    error_rate = sum(
        number
        for ticket in tickets
        for number in ticket
        if all(number not in field_range for field_range in ticket_rules.values())
    )
    return error_rate
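A usage sketch with made-up rules: each `ticket_rules` value is assumed to be a single container of all valid numbers for that field (so the `number not in field_range` check works); the data mirrors the well-known Advent of Code 2020 day 16 sample.

from collections import defaultdict

ticket_rules = defaultdict(set)
ticket_rules["class"] = set(range(1, 4)) | set(range(5, 8))
ticket_rules["row"] = set(range(6, 12)) | set(range(33, 45))
ticket_rules["seat"] = set(range(13, 41)) | set(range(45, 51))

nearby = [[7, 3, 47], [40, 4, 50], [55, 2, 20], [38, 6, 12]]
print(scanning_error_rate(ticket_rules, nearby))   # 4 + 55 + 12 = 71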
Example #17
    def findIsolatedEdges(self, d: defaultdict):

        isolated = []
        for key in d.keys():
            if (not d[key]):
                # print(" Isolated Node -> ", key)
                isolated.append(key)

        for i in isolated:
            print(" Isolated key -> ", i)
Example #18
    def get_statistics_info(self, main_dict: defaultdict) -> defaultdict:
        average_by_all = self.get_average_profit_by_all_enterprize(main_dict)
        statistic_dict = defaultdict(list)
        for key, item in main_dict.items():
            if item <= average_by_all:
                statistic_dict["less_than_average"].append(key)
            else:
                statistic_dict["more_than_average"].append(key)

        return statistic_dict
Example #19
        def dfs_explore_words(exploring_node: defaultdict, word_so_far: str) -> None:
            for c in sorted(exploring_node.keys()):
                if c != self.end_of_word:
                    if len(words_list) < max_recommendation:
                        dfs_explore_words(exploring_node[c], word_so_far + c)
                else:
                    words_list.append(word_so_far)

                if len(words_list) >= max_recommendation:
                    return
Example #20
def filter_words(histogram: defaultdict):
    filtered_words = []
    for w, i in histogram.items():
        if w.endswith('\'s') and w[:-2] in histogram:
            histogram[w[:-2]] += i
            filtered_words.append(w)
        elif w.endswith('s') and w[:-1] in histogram:
            histogram[w[:-1]] += i
            filtered_words.append(w)
    return filtered_words
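A small sketch (hypothetical histogram): plural and possessive forms get merged into the base word's count, and the merged keys are returned so the caller can drop them.

from collections import defaultdict

histogram = defaultdict(int, {"cat": 3, "cats": 2, "dog": 4, "dog's": 1})

merged = filter_words(histogram)
print(merged)                              # ['cats', "dog's"]
print(histogram["cat"], histogram["dog"])  # 5 5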
Example #21
def compare_default_dicts(a: defaultdict, b: defaultdict) -> bool:
    """Compare two defaultdicts, return True if equal, else False.
    Does a benign or soft compare. If the defaultdicts COULD become
    equal, they are considered equal.

    * Does NOT change the memory imprint of any of the dictionaries.
    * Any overlapping keys, must have same value.
    * Keys unique to one, must have the default value of the other.
    * Order of input does NOT matter.

    Example:
    a = defaultdict(lambda: "", a=42, b=42, c="")
    b = defaultdict(lambda: 42, c="", d="")
    compare_default_dicts(a, b) -> True

    Parameters
    ----------
    a : defaultdict
        Default dictionary from collections
    b : defaultdict:
        Default dictionary from collections

    Returns
    -------
    bool : True if equal, else False

    """
    a_keys = set(a)
    b_keys = set(b)
    a_unique_keys = (a_keys | b_keys) - b_keys
    b_unique_keys = (a_keys | b_keys) - a_keys

    # The intersecting keys must have the same value
    if not all(a[key] == b[key] for key in (a_keys & b_keys)):
        return False

    # Keys unique to one, must have default value of other.
    if not all(b.default_factory() == a[key] for key in a_unique_keys):
        return False
    if not all(a.default_factory() == b[key] for key in b_unique_keys):
        return False

    return True
Example #22
def comb0(coins: list, deno: int, cur: defaultdict, ans: set):
    if deno < 0:
        return
    elif deno == 0:
        ans.add(tuple((key, val) for key, val in cur.items()))
    else:
        for coin in coins:
            cur[coin] += 1
            comb0(coins, deno - coin, cur, ans)
            cur[coin] -= 1
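A usage sketch with hypothetical coins. Note that after backtracking, `cur` can still hold coins with a count of 0, so the raw tuples are normalized before printing.

from collections import defaultdict

answers = set()
comb0([2, 3], 6, defaultdict(int), answers)

# Drop (coin, 0) leftovers and deduplicate the combinations
combos = {tuple(sorted((c, n) for c, n in entry if n)) for entry in answers}
print(combos)   # contains ((2, 3),) and ((3, 2),), i.e. 2+2+2 and 3+3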
Example #23
    def distanceBetweenNodes(self, start: str, end: str, graph: defaultdict):

        print(" start -> ", start, " end -> ", end, " graph -> ", graph)

        for k in graph.keys():
            if (k == start):
                print(graph[k])
                if (end in graph[k]):
                    print(" end is available -> ", end, " key -> ", k)

        return None
Example #24
def print_hull_painting(pc: collections.defaultdict):
    min_x, min_y, max_x, max_y = 0, 0, 0, 0
    for j in pc.keys():
        min_x, min_y, max_x, max_y = min(min_x, j[0]), min(min_y, j[1]), max(max_x, j[0]), max(max_y, j[1])
    x_range, y_range = max_x - min_x + 1, max_y - min_y + 1
    x_shift, y_shift = -1 * min_x, -1 * min_y
    print(f"x: [{min_x}, {max_x}]; y: [{min_y}, {max_y}]")
    print(f"x_range: {x_range}; y_range: {y_range}")
    print(f"x_shift: {x_shift}; y_shift: {y_shift}")
    color_map = {0: " ", 1: "*"}
    data = [[color_map[0] for x in range(x_range)] for y in range(y_range)]

    # Put colors in data
    for j in pc.keys():
        data[y_shift + j[1]][x_shift + j[0]] = color_map[pc[j]]

    print("----- Image -----")
    for j in reversed(range(y_range)):
        print("".join(data[j]))
    print("----- End Image -----")
Example #25
def printFrequencies(Frequencies: defaultdict):
    sorted_dict = sorted(Frequencies.items(), key=lambda x: x[1],
                         reverse=True)  # O(n*log(n))
    # for t in sorted_dict: # O(n)
    #     print(t[0],"-> ",t[1]) # O(1) + O(1) + O(1) + O(1) = O(1)
    print(sorted_dict)
    file = open("outputA.txt", "w")
    for t in sorted_dict:  # O(n)
        s = t[0] + "-> " + str(t[1]) + '\n'
        file.write(s)  # O(1) + O(1) + O(1) + O(1) = O(1)
    file.close()
Example #26
def fill_board_manhattan_distance(
        board: np.array,
        manhattan_distance_dict: collections.defaultdict) -> np.array:
    board_copy = board.copy()
    for index, distances in manhattan_distance_dict.items():
        min_distance = min(distances, key=lambda x: x['distance'])
        board_copy[index] = (min_distance['id'] if len([
            distance for distance in distances
            if (distance['distance'] == min_distance['distance'])
        ]) == 1 else '')
    return board_copy
Example #27
def visualize(title: str, series: defaultdict, filename: str,
              per_continent: bool) -> None:
    if per_continent:
        world = pygal.maps.world.SupranationalWorld()
    else:
        world = pygal.maps.world.World()

    world.title = title
    for s in series.items():
        world.add(*s)
    world.render_to_file(f'images/{filename}.svg')
Example #28
File: utils.py Project: hylkedonker/avenio
def dict_sum(a: defaultdict, b: dict, inplace: bool = True) -> defaultdict:
    """
    Calculate a + b, key-by-key.
    """
    if not inplace:
        result = a.copy()
    else:
        result = a
    for key, value in b.items():
        result[key] += value
    return result
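A minimal sketch with hypothetical counters; with `inplace=False` the left-hand defaultdict is copied instead of mutated.

from collections import defaultdict

a = defaultdict(int, {"x": 1, "y": 2})
b = {"y": 3, "z": 4}

total = dict_sum(a, b, inplace=False)
print(dict(total))   # {'x': 1, 'y': 5, 'z': 4}
print(dict(a))       # {'x': 1, 'y': 2} -- unchanged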
Example #29
def defaultdict_to_dict(dictionary: defaultdict) -> Dict:
    """Recursively convert nested :obj:`defaultdict` to :obj:`dict`.

    Args:
        dictionary: A defaultdict.

    Returns:
        The defaultdict as a :obj:`dict`.
    """
    if isinstance(dictionary, defaultdict):
        dictionary = {k: defaultdict_to_dict(v) for k, v in dictionary.items()}
    return dictionary
Example #30
def solution(n: int, islands: defaultdict, xs: defaultdict):
    ans = 0
    bit = BIT(n + 1)

    keys = sorted(islands.keys(), reverse=True)

    for key in keys:
        for x in sorted(islands[key]):
            ans += bit.query(1, xs[x])
            bit.update(xs[x], 1)

    return ans
Example #31
def _fetch_broker_info(ctr_relation_buf: defaultdict):
    """Fetch broker information."""
    ctr_list = list()
    default_policy = {"broker_id": -1, "explorer_id": -1}
    for _broker, _explorer in ctr_relation_buf.items():
        default_policy.update({
            "broker_id": _broker,
            "explorer_id": list(_explorer)
        })
        ctr_list.append(default_policy.copy())

    return ctr_list
Example #32
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText,
                             with_timedelta: bool) -> List:
    # create embeddings per block, stripping the trailing '\n' (newline) first
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray(
                [model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
Example #33
def generate_table_rows(request_count: Counter, request_times:defaultdict) -> List[Dict]:
    table_rows = []
    time_total = sum(chain(*request_times.values()))
    count_total = sum(request_count.values())

    for url, times in request_times.items():
        count = request_count[url]
        times = request_times[url]
        time_sum = sum(times)

        table_rows.append({'count': count,
                           'url': url,
                           'count_perc': round(100 * count / count_total, 2),
                           'time_perc': round(100 * time_sum / time_total, 2),
                           'time_sum': round(time_sum, 2),
                           'time_avg': round(sum(times) / len(times), 2),
                           'time_max': round(max(times), 2),
                           'time_med': round(median(times), 2)})

    table_rows = sorted(table_rows, key=lambda x: x['time_sum'], reverse=True)
    return table_rows
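A hedged usage sketch with made-up request data, showing the input shapes the function expects: a Counter of hit counts per URL and a defaultdict mapping each URL to its list of request durations.

from collections import Counter, defaultdict

request_count = Counter({"/api": 3, "/home": 1})
request_times = defaultdict(list, {"/api": [0.1, 0.2, 0.3], "/home": [1.0]})

for row in generate_table_rows(request_count, request_times):
    print(row["url"], row["time_sum"], row["time_avg"], row["time_med"])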
Example #34
def find_users(users: defaultdict, city_id, cities, needed_cities):
    response = pool.get_next_api().users.search(city = city_id
                                  , count = 1000
                                  , v="5.44"
    )
    count = response["count"]
    if count == 0:
        print ("no users")
        return

    # If this city is not in the users database yet
    if users.get(city_id) == None:
        users[city_id] = defaultdict()
        users[city_id]["users"] = set()
        users[city_id]["count"] = 0

    # If less than 95% of the people from this city have been downloaded.
    if abs(count - len(users[city_id]["users"])) * 100 / count > 5:
        print( "percent: " , abs(count - len(users[city_id]["users"])) * 100 / count)

        # If the database already contains people from this city
        if len(users[city_id]["users"]) != 0:
            users_before = set(users[city_id]["users"])
        else:
            users_before = set()

        # Add the users returned by the query to the database and to users_before
        for item in response["items"]:
            users_before.add(item["id"])
            users[city_id]["users"].add(item["id"])

        for user_id in users_before:
            try:
                response = pool.get_next_api().friends.get( user_id = user_id
                                              , order = "random"
                                              , fields = "city"
                                              , v="5.44")
            except vk.exceptions.VkAPIError as err:
                print(err)
                users[city_id]["users"].discard(user_id)
                continue

            add_users(users, response["items"], needed_cities)
            print("users in ", cities[city_id]["title"], " now: ", len(users[city_id]["users"]))
            # print(response["items"])
            dumpData(users,usersFile)
            if (abs(count - len(users[city_id]["users"])) * 100 / count < 5) or (len(users[city_id]["users"]) > count):
                print("less than 5%")
                break
            sleep(randint(1,3))
    else:
        print("City already exists")
Example #35
def get_cities(cities: defaultdict, region_id: int, users, needed_cities):
    for city_tup in sorted(cities.items(), key=lambda y : y[1]["uc"], reverse=True):
        city = city_tup[1]
        city_id = city["id"]
        if city["uc"] == -1:
            cities[city_id]["uc"] = get_city_users_count(city_id)
            city["uc"] = cities[city_id]["uc"]
            dumpData(cities, citiesFile)

        if city["uc"] > 5000:
            print(city["id"], ": ", city["title"], " - ", city["uc"])
            needed_cities.add(city_id)
            if users.get(city_id) == None:
                users[city_id] = defaultdict()
                users[city_id]["users"] = set()
                users[city_id]["count"] = 0
                dumpData(users,usersFile)
            find_users(users, city["id"], cities, needed_cities)
            dumpData(users,usersFile)
            sleep(randint(1,3))
    return cities
Example #36
def unique_words(hist: defaultdict):
    return len(hist.keys())
Example #37
File: moal.py Project: Thunder1989/SDB
#input1 = [i.strip().split('\\')[-1][:-5] for i in open('soda_pt_rice').readlines()]
label = test_label
name = []
for i in input1:
    s = re.findall('(?i)[a-z]{2,}',i)
    name.append(' '.join(s))
cv = CV(analyzer='char_wb', ngram_range=(3,4))
test_fn = cv.fit_transform(name).toarray()
#test_fd = test_fn

fold = 10
kf = KFold(len(test_fn), n_folds=fold, shuffle=True)
iteration = 100
#lr_ = LR() #clf for use
lr_ = SVC(kernel='linear', probability=True)
CI = DD() #confidence level for each oracle
acc_ = [[] for i in range(iteration)] #acc in each run for averaging
for train, test in kf:
    fd_ = []
    label_ = []
    #TBD: randomly pick two examples from diff classes as starting
    fd_.append(train[0])
    label_.append(test_label[train[0]])
    train = train[1:]
    #needs one more ex from a diff class
    tmp = 0
    for i in train:
        if test_label[i] == label_[0]:
            continue
        else:
            fd_.append(i)
Example #38
 
 frenchTags = [f.strip() for f  in frenchTagset if f.strip() not in commonTags]
 engTags = [e.strip() for e in engTagset if e.strip() not in commonTags]
 
 frenchTagset.close()
 engTagset.close()
 
 print "Tagsets Loaded!!"
 
 enLines = [l.strip() for l in enData]
 frLines = [f.strip() for f in frData]
 alignLines = [a.strip() for a in alignFile]
 
 print "Pure Data and Alignments Loaded!!"
 
 contextDict = DD(int)
 
 testIndices = randomSampleTrain()
 
 print "Train and Test separated!!"
 
 trainIndices = [i for i in range(len(enLines)) if i not in testIndices]
 
 print "Starting Dictionary fillup!!"
 
 for k in range(1):
 
     count = 0
     
     for i in trainIndices:
     
Example #39
File: utils.py Project: mptich/shared
class UtilMultiFile(UtilObject):
    """
    Keeps a specified number of files open, for read or write
    Attributes:
        mode - read or write
        maxCount - maximum number of opened files at any given moment
        hitCount - number of open file hits
        xactCount - number of transactions (reads or writes)
        fileList - list of open file names, sorted by time opened
        fileDict - map of file name to a file handle
    """

    def __init__(self, maxCount, mode):
        self.maxCount = maxCount
        self.mode = mode
        self.fileList = []
        self.fileDict = {}
        self.fileCache = DefDict(list)
        self.hitCount = 0
        self.xactCount = 0

    def write(self, fileName, line):
        # Try to cache it first
        lines = self.fileCache[fileName]
        lines.append(line)
        if len(lines) > 100:
            self.cacheFlush(fileName)

    def cacheFlush(self, fileName):
        assert(self.mode[0] in ('w', 'a'))
        f = self.fileHandle(fileName)
        for l in self.fileCache[fileName]:
            try:
                f.write(l)
            except IOError as e:
                print("Could not write to %s: error %d %s" % (
                    fileName, e.errno, e.strerror))
                return
        self.fileCache[fileName] = []
        self.xactCount += 1

    def fileHandle(self, fileName):
        if fileName not in self.fileDict:
            if len(self.fileList) == self.maxCount:
                oldFileName = self.fileList[0]
                self.fileDict[oldFileName].close()
                del self.fileDict[oldFileName]
                self.fileList = self.fileList[1:]
            try:
                f = open(fileName, self.mode)
            except IOError as e:
                print("Could not open %s: error %d %s" % (fileName, e.errno,
                                                          e.strerror))
                return None
            self.fileDict[fileName] = f
            self.fileList.append(fileName)
        else:
            self.hitCount += 1
        return self.fileDict[fileName]

    def closeAll(self):
        for fileName in self.fileCache.keys():
            self.cacheFlush(fileName)
        for f in self.fileDict.values():
            f.close()
        self.fileDict = {}
        self.fileList = []

    def getStats(self):
        return "%u hits out of %u transactions: %u%%" % (self.hitCount,
                self.xactCount, (100 * self.hitCount / self.xactCount) if
                self.xactCount else 0)
Example #40
        cogRegInt = calculateCogRegInt(cogReg)
        cogDict, cogWeightDictList, taxaDict, taxDist = \
            buildCogTaxaDict(interpolationRange=range(cogRegInt, cogRegInt+2))
        cogDist = buildCogDistances(cogDict, cogWeightDictList,
            **CogDistOptimalParams)
        corr, std = calculateCorrelation(cogDist, taxDist)
        print("CORRELATION: %f STD: %f" % (corr, std))
        print("\nStoring COG distance dictionary...")
        UtilStore(cogDist, COG_DIST_DICT())
        sys.exit(0)

    if (len(sys.argv) == 2) and (sys.argv[1] == "distCounts"):
        print("Building dict of taxonomy dist counts...")
        _, _, taxaDict, taxDist = \
            buildCogTaxaDict(noWeights = True)
        genTaxDistCntDict = DefDict(lambda: [0] *
            (TaxaType.maxDistance() + 1))
        for dir, tdd in taxDist.items():
            for d in tdd.values():
                genTaxDistCntDict[dir][d] += 1
        UtilStore(genTaxDistCntDict, GENOME_TAX_DIST_CNT_DICT())
        ttTaxDistCntDict = {}
        for dir, l in genTaxDistCntDict.items():
            ttTaxDistCntDict[taxaDict[dir].type.key] = l
        UtilStore(ttTaxDistCntDict, TAXTYPE_TAX_DIST_CNT_DICT())
        sys.exit(0)

    print("WRONG COMMAND LINE")



Example #41
def buildCogTaxaDict(noWeights = False, showCogFreqHist = False,
    interpolationRange = None):

    print("reading taxa dictionary...")
    taxaDict = UtilLoad(PROK_TAXA_DICT())
    print("Read %d organisms" % len(taxaDict))

    print("Reading cogDict...")
    cogDict = UtilLoad(COG_DICT())

    print("Building COG frequencies...")
    cogFreq = DefDict(int)
    for dir, cogs in cogDict.items():
        for cname in cogs:
            cogFreq[cname] += 1

    if showCogFreqHist:
        print("Showing cogFreq histogram...")
        UtilDrawHistogram(cogFreq.values(), show = True)

    temp = list(taxaDict.keys())
    for dir in temp:
        if dir not in cogDict:
            del taxaDict[dir]
    temp = list(cogDict.keys())
    for dir in temp:
        if dir not in taxaDict:
            del cogDict[dir]
    print("Valid set contains %d organisms" % len(cogDict))

    print("\nBuilding Taxonomy distances...")
    taxDist = DefDict(dict)
    for dir1, taxa1 in taxaDict.items():
        for dir2, taxa2 in taxaDict.items():
            d = taxa1.distance(taxa2)
            taxDist[dir1][dir2] = d

    # Optimization
    if noWeights:
        return (cogDict, None, taxaDict, taxDist)

    fname = COG_WEIGHTS_DICT_LIST()
    if os.path.isfile(fname):
        print("Loading cogWeightDictList...")
        cogWeightDictList = UtilLoad(fname, progrIndPeriod=100)
    else:
        print("Building cogWeightsDict...")
        cogWeightDictList = [DefDict(dict) for i \
            in range(0, COG_REG_STEP_COUNT+1)]
        if not interpolationRange:
            interpolationRange = range(0, COG_REG_STEP_COUNT+1)
        for i in interpolationRange:
            expCogReg = math.exp(COG_REG_LOWER + float(i) * COG_REG_STEP)
            print("\nexpCogReg %f" % expCogReg)
            cogWeightDict = cogWeightDictList[i]
            for ind, (dir1, cogs1) in enumerate(cogDict.items(), start=1):
                print("\r%d.%d. %s" % (i, ind, dir1), end="")
                for dir2, cogs2 in cogDict.items():
                    cogWeightDict[dir1][dir2] = \
                        cogSetWeight(cogs1 & cogs2, cogFreq, expCogReg)
            print()
        UtilStore(cogWeightDictList, fname)

    return (cogDict, cogWeightDictList, taxaDict, taxDist)
Example #42
    train_fn = fn[train]
    #n_class = len(np.unique(label[train]))
    n_class = 30
    '''
    c = AC(n_clusters=n_class, affinity='cosine', linkage='average')
    c.fit(train_fd)
    tmp = dd(list)
    for i,j in zip(c.labels_, train):
        tmp[i].append(j)
    for k,v in tmp.items():
        for vv in v:
            print k, input3[vv]
    '''
    c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
    c.fit(train_fn)
    tmp = DD(list)
    for i,j in zip(c.labels_,train):
        tmp[i].append(j)
    for k,v in tmp.items():
        for vv in v:
            pass
            #print k, input1[vv]

    ex = DD(list)
    dist = np.sort(c.transform(train_fn))
    for i,j,k in zip(c.labels_,train,dist):
        ex[i].append([j,k[0]])
        print i,k[0],input3[j]
    for i,j in ex.items():
        ex[i] = sorted(j, key=lambda x: x[-1])
Example #43
def word_search(key: str, hist: defaultdict):
    """Return the frequency of `key` in `hist`, or 0 if it is absent."""
    for word, count in hist.items():
        if word == key:
            return count
    return 0
Example #44
def average_frequency(hist: defaultdict):
    total = 0
    for freq in hist.values():
        total += int(freq)
    return total / len(hist)
Example #45
cogDict, cogFreq, cogWeightDict, taxaDict = ccm.buildCogTaxaDict()
print ("cogDict len %d, taxaDict len %d" % (len(cogDict), len(taxaDict)))

dirCorrDict = UtilLoad(GENOME_CORR_DICT())
print ("dirCorrDict len %d" % len(dirCorrDict))

print("Building COG distances...")
cogDist = DefDict(dict)
for ordinal, (dir1, cs1) in enumerate(cogDict.iteritems(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, cs2 in cogDict.iteritems():
        cogDist[dir1][dir2] = cogDistFunc(cs1, cs2)

print("\nBuilding average distances for TaxaTypes...")
# Genome dir -> dict of {taxaTypes -> avg COG distance to dir}
dirTaxaTypeDictDict = DefDict(lambda: DefDict(list))
for ordinal, dir1 in enumerate(taxaDict.keys(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, taxa in taxaDict.iteritems():
        dirTaxaTypeDictDict[dir1][repr(taxa.type)].append(cogDist[dir1][dir2])

print("\nRebuilding dirTaxaTypeDictDict to get UtilNormDistribs...")
for dir, d in dirTaxaTypeDictDict.iteritems():
    # Find global weighted STD
    std = 0.
    totalLen = 0
    for taxaTypeStr, distList in d.iteritems():
        if len(distList) >= 2:
            val = np.std(distList, ddof = 1.)
            std += val * val * len(distList)
            totalLen += len(distList)
Example #46
File: type_lwe.py Project: Thunder1989/SDB
label = test_label
class_ = np.unique(train_label)
name = []
for i in input1:
    s = re.findall('(?i)[a-z]{2,}',i)
    name.append(' '.join(s))
cv = CV(analyzer='char_wb', ngram_range=(3,4))
test_fn = cv.fit_transform(name).toarray()
for b in bl:
    print b.score(test_fd,label)

n_class = 32/2
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(test_fn)
dist = np.sort(c.transform(test_fn))
ex = DD(list) #example id, distance to centroid
ex_id = DD(list) #example id for each C
ex_N = [] #number of examples for each C
for i,j,k in zip(c.labels_, xrange(len(test_fn)), dist):
    ex[i].append([j,k[0]])
    ex_id[i].append(int(j))
for i,j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
    ex_N.append([i,len(ex[i])])
ex_N = sorted(ex_N, key=lambda x: x[-1],reverse=True) #sort cluster by density
nb_c = DD()
for exx in ex_id.values():
    exx = np.asarray(exx)
    for e in exx:
        nb_c[e] = exx[exx!=e]
nb_f = [DD(), DD(), DD()]
Example #47
# Build a tree of TaxaTypes
taxaTypeTree = TaxaTypeTree(taxaDict)

# Set of all Taxa types on all levels
allTaxaTypes = taxaTypeTree.getAllTypesSet()
print("Length of allTaxaTypes %d" % len(allTaxaTypes))

# Build a dictionary: [dir][taxaType] -> UtilObject(mean, std,
# isAncest, distList), where
# mean - mean distance between this dir and all [other] dirs in this taxaType
# std - standard deviation, if applicable
# distList - list of all the distances
print("Building taxaTypeDistDict...")
taxaTypeDistDict = DefDict(dict)
taxaTypeAncDistDict = DefDict(lambda: [None] * (TaxaType.hierarchySize()+1))
for ind, dir in enumerate(taxaDict, start = 1):
    print("\r%u. %s" % (ind, dir)),
    d = taxaTypeDistDict[dir]
    listAnc = taxaTypeAncDistDict[dir]
    cogDistForDir = cogDist[dir]
    for currType in allTaxaTypes:
        dirs = taxaTypeTree.getDirSet(currType)
        if dir in dirs:
            ancestorList = True
            dirs.remove(dir)
        else:
            ancestorList = False
        distList = [cogDistForDir[x] for x in dirs]
        meanVal = 0.0
        stdVal = 0.0
Example #48
    c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
    c.fit(c_fn)
    '''
    c = DPGMM(n_components=50, covariance_type='diag', alpha=1)
    c.fit(train_fd)
    c_labels = c.predict(train_fd)
    print '# of GMM', len(np.unique(c_labels))
    mu = c.means_
    cov = c._get_covars()
    c_inv = []
    for co in cov:
        c_inv.append(np.linalg.inv(co))
    e_pr = np.sort(c.predict_proba(train_fd))
    '''
    dist = np.sort(c.transform(c_fn))
    ex = DD(list) #example id, distance to centroid
    ex_id = DD(list) #example id for each C
    ex_N = [] #number of examples for each C
    #for i,j,k in zip(c_labels, train, e_pr):
    for i,j,k in zip(c.labels_, train, dist):
        ex[i].append([j,k[0]])
        ex_id[i].append(int(j))
    for i,j in ex.items():
        ex[i] = sorted(j, key=lambda x: x[-1])
        ex_N.append([i,len(ex[i])])
    ex_N = sorted(ex_N, key=lambda x: x[-1],reverse=True) #sort cluster by density

    #confidence of training ex
    label_pr = np.sort(md.predict_proba(data2[train]))
    cf_d = DD()
    for i,pr in zip(train, label_pr):