Example #1
    def _set_parent_errors(self, trace):
        """
        Set the parent_error flag on every transaction that lies below a failed call in the trace

        Parameters
        ----------
        trace : list
            List of transactions
        """
        errors = {}
        for transaction in trace:
            if "error" in transaction.keys():
                if transaction["transactionHash"] not in errors.keys():
                    errors[transaction["transactionHash"]] = trie.Trie()
                errors[transaction["transactionHash"]][
                    transaction["traceAddress"]] = True
        for transaction in trace:
            if transaction["transactionHash"] in errors.keys():
                prefix_exists = bool(
                    errors[transaction["transactionHash"]].shortest_prefix(
                        transaction["traceAddress"]))
                is_node = errors[transaction["transactionHash"]].has_key(
                    transaction["traceAddress"])
                if prefix_exists and not is_node:
                    transaction["parent_error"] = True
Example #2
 def __init__(self, args):
     self.paths = args.paths
     self.max_num = int(args.max_num)
     self.final_trie = tri.Trie()
     self.trie_per_file = []
     self.__char_to_remove = re.compile(r'[,.!?*#();:\[\]{}]')
     self.count_words()
Example #3
 def build_trie(self):
     """ building Trie from piece
     """
     Trie = pygtrie.Trie()
     for (key, score) in self.SentencePiece.get_pieces().items():
         Trie[key] = (key, score)
     self.Trie = Trie
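A trie like this is typically queried for every stored piece that matches at a given position; a sketch of that lookup (the pieces and scores below are made up, not the class's own tokenisation code):

    import pygtrie

    pieces = {"h": -3.0, "he": -2.5, "hello": -1.2}  # hypothetical (piece, score) pairs
    Trie = pygtrie.Trie()
    for key, score in pieces.items():
        Trie[key] = (key, score)

    for item in Trie.prefixes("hello world"):
        print(item[1])  # ('h', -3.0), then ('he', -2.5), then ('hello', -1.2)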
Example #4
 def __init__(self,
              language: str = "ru",
              mode: Mode = Mode.GRAPHEMES,
              raw_dict_path=None,
              trie_path=None,
              zalyzniak_dict=ZALYZNYAK_DICT,
              cmu_dict=CMU_DICT) -> None:
     self.data = pygtrie.Trie()  # type: Dict[str, Set[Stress]]
     self.raw_dict_path = raw_dict_path
     self.trie_path = trie_path
     if language == "ru" and mode == self.Mode.GRAPHEMES:
         self.__init_defaults(RU_GRAPHEME_STRESS_PATH,
                              RU_GRAPHEME_STRESS_TRIE_PATH)
         if not os.path.exists(self.raw_dict_path):
             from rupo.dict.zaliznyak import ZalyzniakDict
             ZalyzniakDict.convert_to_accent_only(zalyzniak_dict,
                                                  self.raw_dict_path)
     elif mode == self.Mode.PHONEMES and language == "en":
         self.__init_defaults(EN_PHONEME_STRESS_PATH,
                              EN_PHONEME_STRESS_TRIE_PATH)
         if not os.path.exists(self.raw_dict_path):
             CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
     else:
         assert False
     if not os.path.isfile(self.raw_dict_path):
         raise FileNotFoundError("Dictionary raw file not found.")
     if os.path.isfile(self.trie_path):
         self.load(self.trie_path)
     else:
         self.create(self.raw_dict_path, self.trie_path)
Example #5
 def _read_file(self, path):
     """
     :param path: path of a file
     :return: the file path and a Trie counting every word that falls inside the time frames
     """
     trie = tri.Trie()
     counted_file = ''
     with open(path, 'r', buffering=DEFAULT_BUFFER_SIZE) as f:
         in_time_frame = False
         for line in f:
             dates_string = utils.find_legal_date(line)
             if dates_string:
                 try:
                     date = list(datefinder.find_dates(dates_string))[0]
                 except (IndexError, OverflowError):
                     date = None
                 if date is not None:
                     in_time_frame = self._is_timestamps_in_time_frame(
                         date, in_time_frame)
             if in_time_frame:
                 if not counted_file:
                     counted_file = path
                 for word in line.split():
                     word = self.__char_to_remove.sub('', word.lower())
                     if word:
                         if word in trie:
                             trie[word] += 1
                         else:
                             trie[word] = 1
     return path, trie
Example #6
 def buildSearchTrie(self, choices):
     searchtrie = trie.Trie()
     for choice in choices:
         for token in self.tokenizeChoice(choice):
             if not searchtrie.has_key(token):
                 searchtrie[token] = []
             searchtrie[token].append(choice)
     return searchtrie
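Keying the trie by token means a later prefix query can return the choices for every token that starts with what the user has typed. A sketch of such a lookup, assuming the trie module here is pygtrie and with invented choices:

    import pygtrie as trie

    searchtrie = trie.Trie()
    for choice in ["green apple", "green tea", "grapefruit"]:
        for token in choice.split():
            if not searchtrie.has_key(token):
                searchtrie[token] = []
            searchtrie[token].append(choice)

    matches = set()
    for choices in searchtrie.values(prefix="gre"):
        matches.update(choices)
    print(sorted(matches))  # ['green apple', 'green tea']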
Example #7
 def get_phoenetic_trie():
     dd = defaultdict(list)
     tree = trie.Trie()
     for k, vs in cmu.items():
         for v in vs:
             dd[','.join(v)].append(k)
     for k, v in dd.items():
         tree[k.split(',')] = v
     return tree, cmu
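Because the keys here are phoneme sequences (lists), lookups take the same list form. A sketch with two hand-written entries standing in for the full cmu dictionary, assuming the trie module is pygtrie:

    import pygtrie as trie

    tree = trie.Trie()
    tree[["K", "AE1", "T"]] = ["cat"]
    tree[["K", "AE1", "T", "S"]] = ["cats"]

    print(tree[["K", "AE1", "T"]])  # ['cat']
    key, words = tree.longest_prefix(["K", "AE1", "T", "S", "R"])
    print(key, words)               # ('K', 'AE1', 'T', 'S') ['cats']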
Example #8
    def create_trie():
        tostring = (getattr(array.array, 'tobytes', None) or  # Python 3
                    getattr(array.array, 'tostring'))  # Python 2

        trie = pygtrie.Trie()
        for x in range(100):
            y = tostring(array.array('h', range(x, 1000)))
            trie.update([(y, x)])
        return trie
Example #9
 def __attrs_post_init__(self):
     self.words_queue = PriorityQueue(maxsize=0)
     if not self.words_tree:
         root = Path(__file__).parent
         word_list = (root / "words.txt").read_text().splitlines()
         self._words = [w.rstrip().upper() for w in word_list if 9 >= len(w.strip()) > 2]
         self.words_tree = pygtrie.Trie()
         for w in self._words:
             self.words_tree[w] = True
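For a word search over such a tree, the useful queries are "is this a complete word?" and "can this prefix still grow into one?". A small sketch with placeholder words:

    import pygtrie

    words_tree = pygtrie.Trie()
    for w in ["CAT", "CATS", "CAR"]:
        words_tree[w] = True

    print(words_tree.has_key("CAT"))     # True  -> a complete word
    print(words_tree.has_subtrie("CA"))  # True  -> longer words hang below, keep extending
    print(words_tree.has_subtrie("CX"))  # False -> dead prefix, prune the search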
Example #10
 def __init__(self, args):
     self._time_frames = utils.merge_intervals(args.time_frames)
     self._max_num = int(args.max_num)
     self.final_trie = tri.Trie()
     self.__trie_per_file = []
     self.__char_to_remove = re.compile(r'[=,.!?*#();:\[\]{}]')
     self._debug_mode = args.debug
     self._counted_word_files = set()
     self.count_words()
Example #11
 def build_trie(self, seq_iter):
     trie = pygtrie.Trie()
     N = self.max_prefix_len
     for seq in seq_iter:
         for start in xrange(len(seq)):
             key = tuple(seq[start:start + N])
             trie[key] = 1
     self.trie = trie
     return trie
Example #12
  def __init__(self, steps_per_second, num_velocity_bins, min_pitch, max_pitch,
               add_eos=False, ngrams=None):
    """Initialize a MidiPerformanceEncoder object.

    Encodes MIDI using a performance event encoding. Index 0 is unused as it is
    reserved for padding. Index 1 is unused unless `add_eos` is True, in which
    case it is appended to all encoded performances.

    If `ngrams` is specified, vocabulary is augmented with a set of n-grams over
    the original performance event vocabulary. When encoding, these n-grams will
    be replaced with new event indices. When decoding, the new indices will be
    expanded back into the original n-grams.

    No actual encoder interface is defined in Tensor2Tensor, but this class
    contains the same functions as TextEncoder, ImageEncoder, and AudioEncoder.

    Args:
      steps_per_second: Number of steps per second at which to quantize. Also
          used to determine number of time shift events (up to one second).
      num_velocity_bins: Number of quantized velocity bins to use.
      min_pitch: Minimum MIDI pitch to encode.
      max_pitch: Maximum MIDI pitch to encode (inclusive).
      add_eos: Whether or not to add an EOS event to the end of encoded
          performances.
      ngrams: Optional list of performance event n-grams (tuples) to be
          represented by new indices. N-grams must have length at least 2 and
          should be pre-offset by the number of reserved IDs.

    Raises:
      ValueError: If any n-gram has length less than 2, or contains one of the
          reserved IDs.
    """
    self._steps_per_second = steps_per_second
    self._num_velocity_bins = num_velocity_bins
    self._add_eos = add_eos
    self._ngrams = ngrams or []

    for ngram in self._ngrams:
      if len(ngram) < 2:
        raise ValueError('All n-grams must have length at least 2.')
      if any(i < self.num_reserved_ids for i in ngram):
        raise ValueError('N-grams cannot contain reserved IDs.')

    self._encoding = magenta.music.PerformanceOneHotEncoding(
        num_velocity_bins=num_velocity_bins,
        max_shift_steps=steps_per_second,
        min_pitch=min_pitch,
        max_pitch=max_pitch)

    # Create a trie mapping n-grams to new indices.
    ngram_ids = range(self.unigram_vocab_size,
                      self.unigram_vocab_size + len(self._ngrams))
    self._ngrams_trie = pygtrie.Trie(zip(self._ngrams, ngram_ids))

    # Also add all unigrams to the trie.
    self._ngrams_trie.update(zip([(i,) for i in range(self.unigram_vocab_size)],
                                 range(self.unigram_vocab_size)))
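The n-gram replacement described in the docstring is what the trie makes cheap: at each position of an event sequence, longest_prefix returns the longest known n-gram (falling back to a unigram) that starts there. A standalone sketch of that matching loop, with made-up event ids rather than the encoder's real vocabulary:

    import pygtrie

    # Hypothetical vocabulary: unigrams 2..5 map to themselves, and the bigram (3, 4)
    # has been assigned the new index 100.
    ngrams_trie = pygtrie.Trie()
    ngrams_trie.update(zip([(i,) for i in range(2, 6)], range(2, 6)))
    ngrams_trie[(3, 4)] = 100

    events, ids = [2, 3, 4, 5], []
    i = 0
    while i < len(events):
        key, idx = ngrams_trie.longest_prefix(events[i:])  # longest known n-gram starting at i
        ids.append(idx)
        i += len(key)
    print(ids)  # [2, 100, 5]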
Example #13
def _read_word_list(word_list_filepath: Path = DEFAULT_WORD_LIST_PATH) -> pygtrie.Trie:
    """
    :param Path word_list_filepath: Path to a file containing the list of valid words
    :return: Trie containing all valid words
    :rtype: pygtrie.Trie
    """
    word_list = pygtrie.Trie()
    with open(word_list_filepath, mode="r") as word_list_file:
        # Word list file must contain one word per line
        for word in word_list_file:
            word_list[word.strip()] = True
    return word_list
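Once loaded, the trie answers both membership and prefix questions, which is what makes it preferable to a plain set for word searches. A small sketch with invented words:

    import pygtrie

    word_list = pygtrie.Trie()
    for word in ["ant", "antler", "bee"]:
        word_list[word] = True

    print("ant" in word_list)             # True: a complete valid word
    print(word_list.has_subtrie("antl"))  # True: some valid word starts with "antl"
    print(word_list.has_subtrie("anz"))   # False: nothing starts with "anz"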
Example #14
 def _read_file(self, path):
     """
     :param path: path of a file
     :return: Trie with a count for every word in the file
     """
     trie = tri.Trie()
     with open(path, 'r', buffering=DEFAULT_BUFFER_SIZE) as f:
         for line in f:
             for word in line.split():
                 word = self.__char_to_remove.sub('', word.lower())
                 if word in trie:
                     trie[word] += 1
                 else:
                     trie[word] = 1
     return trie
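Because the counts are keyed by word in a trie, they can later be aggregated per prefix. A sketch of such a query, keeping the example's tri alias and assuming it refers to pygtrie:

    import pygtrie as tri

    trie = tri.Trie()
    for word in ["trie", "tried", "tries", "tree"]:
        if word in trie:
            trie[word] += 1
        else:
            trie[word] = 1

    total = sum(count for _, count in trie.items(prefix="trie"))
    print(total)  # 3: "trie", "tried" and "tries"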
Example #15
def identical_subarray_trie(arr1, arr2):

    t = pygtrie.Trie()

    large = small = []
    if len(arr1) <= len(arr2):
        small, large = arr1, arr2
    else:
        small, large = arr2, arr1

    for item in small:
        print item
        t[item] = item

    print
    if t.values([1, 2, 3]):
        print
Example #16
    def test_large_trie(self):
        """Test handling of large tries which would overflow stack."""
        tostring = (
            getattr(array.array, 'tobytes', None) or  # Python 3
            getattr(array.array, 'tostring'))  # Python 2

        trie = pygtrie.Trie()
        for x in range(100):
            y = tostring(array.array('h', range(x, 1000)))
            trie[y] = x

        # Plain iteration
        n = 0
        for _ in trie.iteritems():
            n += 1
        self.assertEqual(100, n)

        # Copy
        self.assertEqual(trie, copy.copy(trie))
        self.assertEqual(trie, copy.deepcopy(trie))
Example #17
 def __init__(self, language: str="ru", mode: Mode=Mode.GRAPHEMES, raw_dict_path=None, trie_path=None,
              zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None:
     self.data = pygtrie.Trie()
     self.raw_dict_path = raw_dict_path
     self.trie_path = trie_path
     if language == "ru" and mode == self.Mode.GRAPHEMES:
         self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH)
         if not os.path.exists(self.raw_dict_path):
             ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path)
     elif mode == self.Mode.PHONEMES and language == "en":
         self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH)
         if not os.path.exists(self.raw_dict_path):
             CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
     else:
         assert False
     if not os.path.isfile(self.raw_dict_path):
         raise FileNotFoundError("Dictionary raw file not found.")
     if os.path.isfile(self.trie_path):
         self.load(self.trie_path)
     else:
         self.create(self.raw_dict_path, self.trie_path)
Example #18
def populate_dictionary_trie():
    global is_dictionary_trie_populated
    global DICTIONARY_TRIE

    if is_dictionary_trie_populated and DICTIONARY_TRIE is not None:
        # Avoid re-populating the Trie if it has already been created and populated in memory
        return DICTIONARY_TRIE

    # Trie data structure for storing dictionary words, with fast retrieval and prefix matching
    DICTIONARY_TRIE = trie.Trie()

    # https://stackoverflow.com/a/6475407/3766839
    with open(os.path.join(get_script_path(), "dictionary.txt"), "r") as file:
        for word in file:
            # Strip the trailing newline and make the word uppercase
            word = word.upper().rstrip()

            # Skip words outside the allowed length range; infrequent 2-letter words
            # (LA, FR, etc.) are effectively random and give bad outputs
            if MIN_WORD_LENGTH_DICTIONARY <= len(word) <= MAX_WORD_LENGTH_DICTIONARY:
                DICTIONARY_TRIE[word] = True

    is_dictionary_trie_populated = True
    return DICTIONARY_TRIE
Example #19
file_list = {}
file_id = 0
for d, subdirs, files in os.walk(args.dir):
    for f in files:
        if pat.search(f):
            print('processing {} ...'.format(os.path.join(d, f)))
            file_list[file_id] = os.path.join(d, f)
            file_id = file_id + 1
            with open(os.path.join(d, f), 'r') as content_file:
                content = content_file.read()
                fm.push_back(content)
fm.build()

seen_clones = set()
clone_fragments = pygtrie.Trie()
clones = pygtrie.Trie()


def add_clone(seq, v):
    for item in clones.prefixes(seq):
        if item[1][0] <= v[0]:
            del clones[item[0]]
    clones[seq] = v
    for i in range(1, len(seq)):
        for item in clone_fragments.prefixes(seq[i:]):
            if item[1][0] <= v[0]:
                del clone_fragments[item[0]]
        clone_fragments[seq[i:]] = v
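add_clone relies on prefixes(seq), which yields every stored key that is a prefix of seq together with its value (accessed by index, as above). A tiny sketch of that call in isolation, with placeholder keys and values:

    import pygtrie

    clones = pygtrie.Trie()
    clones[(1, 2)] = (5, "a")      # made-up (length, data) values
    clones[(1, 2, 3)] = (7, "b")

    for item in clones.prefixes((1, 2, 3, 4)):
        print(item[0], item[1])    # (1, 2) (5, 'a')  then  (1, 2, 3) (7, 'b')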

Example #20
 def updateChoices(self, choices):
     self.choices = sorted(filter(None, choices))
     self.searchtree = trie.Trie()
Example #21
def gen(cs, N, size):
    words = numpy.random.choice(cs, [N, size])
    t = pygtrie.Trie()
    for word in words:
        t[str(word)] = str(word)
    return t
Example #22
def _load_suffixes():
    suffixes = trie.Trie()
    with open(os.path.join(RESOURCE_PATH, 'suffixes_list.txt'), 'r', encoding='utf-8') as fp:
        for suffix in fp.read().split('\n'):
            suffixes[suffix[::-1]] = suffix
    return suffixes
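Because each suffix is stored reversed, finding the longest suffix of a word becomes a longest_prefix query on the reversed word. A sketch with a couple of hand-picked suffixes, assuming the trie module is pygtrie:

    import pygtrie as trie

    suffixes = trie.Trie()
    for suffix in ["ing", "ning"]:
        suffixes[suffix[::-1]] = suffix

    key, best = suffixes.longest_prefix("running"[::-1])
    print(best)  # 'ning': the longest stored suffix matching "running"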
Example #23
    def init_path_details(self, path, sg):
        print("going to initiate path details for: ", path)

        # to return:
        path_execs = []
        path_join_keys = []
        path_tries = []

        first_node = path[0]
        node_info = sg.nodes()[first_node]
        table = ALIAS_FORMAT.format(TABLE=node_info["real_name"],
                                    ALIAS=first_node)
        nodes_seen = set()

        if first_node not in self.init_sels:
            sels = ",".join(node_info["sels"])
            where_clause = ""
            if len(node_info["predicates"]) > 0:
                preds = " AND ".join(node_info["predicates"])
                where_clause = "WHERE " + preds
            exec_sql = FIRST_HOP_TMP.format(SELS=sels,
                                            TABLE=table,
                                            WHERE=where_clause)
            self.cursor.execute(exec_sql)
            outputs = self.cursor.fetchall()
            self.init_sels[first_node] = outputs
            print("computed first hop outputs: ", len(outputs))

        # for first node
        nodes_seen.add(first_node)
        path_execs.append(None)
        path_join_keys.append(None)

        # for rest of the path, compute join statements
        if self.use_tries:
            print("creating tries...")
            path_tries.append(None)

        for node_idx in range(1, len(path), 1):
            created_index = False
            node = path[node_idx]
            node_info = sg.nodes()[node]
            table = ALIAS_FORMAT.format(TABLE=node_info["real_name"],
                                        ALIAS=node)
            sels = ",".join(node_info["sels"])
            join_edges = list(nx.edge_boundary(sg, nodes_seen, {node}))
            assert len(join_edges) != 0

            nodes_seen.add(node)

            # FIXME: check triangle condition
            if self.use_tries:
                join = join_edges[0]
                path_execs.append(None)
                where_clause = ""
                if len(node_info["predicates"]) > 0:
                    preds = " AND ".join(node_info["predicates"])
                    where_clause = "WHERE " + preds
                exec_sql = FIRST_HOP_TMP.format(SELS=sels,
                                                TABLE=table,
                                                WHERE=where_clause)
                print("exec sql for trie: ", exec_sql)
                cur_join_col = sg[join[0]][join[1]][join[1]]
                other_col = sg[join[0]][join[1]][join[0]]
                print("cur join col: ", cur_join_col)
                print("other col: ", other_col)

                trie_idx = None
                for sel_i, sel in enumerate(node_info["sels"]):
                    if sel == cur_join_col:
                        trie_idx = sel_i
                assert trie_idx is not None
                path_join_keys.append([other_col])
                trie_key_name = node_info["sels"][trie_idx]
                sql_key = deterministic_hash(exec_sql + trie_key_name)
                if sql_key in self.trie_cache:
                    kl_start = time.time()
                    trie = self.trie_cache[sql_key]
                    print("loading trie {} from in memory klepto took: {}".
                          format(node,
                                 time.time() - kl_start))
                elif sql_key in self.trie_cache.archive:
                    # trie = None
                    kl_start = time.time()
                    trie = self.trie_cache.archive[sql_key]
                    print("loading trie {} from klepto took: {}".format(
                        node,
                        time.time() - kl_start))
                else:
                    st = time.time()
                    self.cursor.execute(exec_sql)
                    outputs = self.cursor.fetchall()

                    trie = pygtrie.Trie()
                    for out in outputs:
                        if str(out[trie_idx]) not in trie:
                            trie[str(out[trie_idx])] = []
                        trie[str(out[trie_idx])].append(out)
                    trie_time = time.time() - st
                    print("trie for {}, len: {}, took: {}".format(
                        node, len(outputs), trie_time))
                    self.total_trie_time += trie_time

                    if trie_time > TRIE_USE_THRESHOLD:
                        trie = None
                        self.trie_cache.archive[sql_key] = trie
                    elif TRIE_ARCHIVE_THRESHOLD < trie_time < TRIE_USE_THRESHOLD:
                        self.trie_cache.archive[sql_key] = trie

                # no matter what, store in memory cache so we avoid reloading
                # it from archive in the next path
                self.trie_cache[sql_key] = trie
                path_tries.append(trie)
                if trie is None:
                    path_join_keys[-1] = None
            else:
                path_tries.append(None)
                path_join_keys.append(None)

            if path_tries[-1] is None:
                fkey_conds = []
                cur_join_cols = []
                index_cols = []

                join = join_edges[0]
                assert node == join[1]
                # a value for this column would already have been selected
                other_col = sg[join[0]][join[1]][join[0]]
                cur_join_cols.append(other_col)
                # other_val = vals[other_col]
                cur_col = sg[join[0]][join[1]][join[1]]

                col_name = cur_col.split(".")[1]
                if col_name not in index_cols:
                    index_cols.append(col_name)
                other_col_key = "X" + other_col + "X"
                cond = cur_col + " = " + other_col_key
                fkey_conds.append(cond)

                # path_join_keys.append(cur_join_cols)
                assert path_join_keys[-1] is None
                path_join_keys[-1] = cur_join_cols
                assert len(fkey_conds) != 0

                # FIXME: check math
                fkey_conds += node_info["predicates"]
                fkey_cond = " AND ".join(fkey_conds)
                for col in node_info["pred_cols"]:
                    col_name = col.split(".")[1]
                    if col_name not in index_cols:
                        index_cols.append(col_name)

                exec_sql = NEXT_HOP_TMP.format(FKEY_CONDS=fkey_cond,
                                               TABLE=table,
                                               SELS=sels)
                # assert path_execs[-1] is None
                if path_execs[-1] is not None:
                    print(exec_sql)
                    print(path_execs)
                    pdb.set_trace()

                path_execs[-1] = exec_sql

        return path_execs, path_join_keys, path_tries
Example #24
    def __init__(self, words):

        self.nodes = trie.Trie()
        self.maketrie(words)
Example #25
 def __init__(self):
     self.tree = trie.Trie()
Example #26
def main_algo(features, tweetid, lastclusterid):

    fvecs, freqdict = tfidf_all.get_tfidf_freqdict(features)

    # Creating random vectors
    num_randvecs = 13
    random_vectors = randomvecs.getVecs(len(freqdict), num_randvecs)

    # Initialising prefix trees
    a = []
    b = []
    prime = 13
    P = []
    # modP = int(input("Enter number of permutations to be used : "))
    modP = 20
    for i in range(modP):
        atemp = random.uniform(1, prime)
        btemp = random.uniform(0, prime)
        a.append(atemp)
        b.append(btemp)
        P.append(pygtrie.Trie())

    index = 0
    wordindexmap = {}
    for key in freqdict.keys():
        wordindexmap[key] = index
        index = index + 1

    # MAIN TWEET LOOP

    tweetclustermap = {}
    clusterdict = {}
    for fvec in fvecs:
        tweetsign = signature.getSign(fvec, random_vectors, wordindexmap)

        # Insert tweet signature in prefix tree and find its nearest neighbor in that tree
        nearestNeighbours = []
        for i in range(modP):
            signPerm = [None] * len(tweetsign)
            for x in range(len(tweetsign)):
                ind = int(a[i] * x + b[i]) % prime
                signPerm[x] = tweetsign[ind]

            if P[i].has_key(signPerm):
                P[i][signPerm].append(tweetid)
            else:
                P[i][signPerm] = [tweetid]

            neighbor, hdist = nearest_neighbor.getNN(signPerm, P[i])

            if neighbor is None:
                pass
            elif hdist == 0:
                neighbor.remove(tweetid)
                nearestNeighbours.append((neighbor, hdist))
            elif hdist == 1:
                nearestNeighbours.append((neighbor, hdist))
            elif (hdist > 1):
                templist = []
                for item in neighbor:
                    templist += item[1]
                nearestNeighbours.append((templist, hdist))

        mindist = len(signPerm) + 10
        closestNeighbors = []
        for pair in nearestNeighbours:
            if pair[1] <= mindist:
                mindist = pair[1]

        for pair in nearestNeighbours:
            if pair[1] == mindist:
                for i in range(len(pair[0])):
                    if not pair[0][i] in closestNeighbors:
                        closestNeighbors.append(pair[0][i])

        # T = float(input("Enter the similarity threshold : "))
        T = 0.05
        tweetclustermap[0] = 0
        clusterdict[0] = [0]

        for cneighbor in closestNeighbors:
            if (similarity.cosine_similarity(fvec, fvecs[cneighbor]) >= T):
                if (tweetid in tweetclustermap.keys()):
                    if (not (tweetclustermap[tweetid]
                             == tweetclustermap[cneighbor])):
                        tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                        clusterdict[tweetclustermap[cneighbor]].append(tweetid)
                else:
                    tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                    clusterdict[tweetclustermap[cneighbor]].append(tweetid)
            else:
                if (not (tweetid in tweetclustermap.keys())):
                    tweetclustermap[tweetid] = lastclusterid + 1
                    clusterdict[lastclusterid + 1] = [tweetid]
                    lastclusterid += 1

        tweetid = tweetid + 1
    return clusterdict, fvecs, freqdict
Example #27
    "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates",
    "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu",
    "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe",
    "↑ UN member states and observer states ↑", "", "↓ Other states ↓",
    "Abkhazia", "Artsakh", "Cook Islands", "Kosovo", "Niue", "Northern Cyprus",
    "Sahrawi Arab Democratic Republic", "Somaliland", "South Ossetia",
    "Taiwan", "Transnistria", "↑ Other states ↑"
]

us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky[E]", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
    "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin",
    "Wyoming"
]

x = trie.Trie()
output = {}
for country in countries:
    for i in country.split():
        if not x.has_key(i):
            x[i] = []
        x[i].append(country)

a = 10
Example #28
 def __init__(self):
     self.data = pygtrie.Trie()
Example #29
 def train(self):
     print "counting training doc ..."
     pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
     # candidates = hat_trie.Trie()
     candidates = pygtrie.Trie()
     self.doc_length = 0
     # Note: doc starts as a single space. The character at doc[0] is never itself a
     # traversal target; it only serves as the left neighbour of doc[1], so the starting
     # point's left neighbour is not lost when the next batch is trained.
     doc = u' '
     line_cnt = 0
     with codecs.open(self.doc, 'r', 'utf-8') as f:
         for line in f:
             line = re.sub(pattern, '', line)
             self.doc_length += len(line)
             doc += line
             line_cnt += 1
             if line_cnt % 10000 == 0:
                 print "{} lines processed".format(line_cnt)
                 # if line_cnt == 110000:
                 #     break
             # Process the buffer once every batch_size characters
             if len(doc) < self.batch_size:
                 continue
             length = len(doc)
             # Start from 1 to keep the left neighbour carried over from the previous pass.
             # Do not go all the way to length, so that from every starting point in this
             # pass a substring of self.max_word characters can still be taken.
             # For example, length = 10 ==> indices 0 1 2 3 4 5 6 7 8 9 and
             # self.max_word = 5 ==> i goes up to at most 4, so the substring
             # "45678" can be taken and its right neighbour '9' is still available.
             for i in xrange(1, length - self.max_word):
                 for j in xrange(i + 1, i + self.max_word + 1):
                     text = doc[i: j]
                     if text not in candidates:
                         candidates[text] = Word(text)
                     candidates[text].meet(doc[i - 1: i], doc[j: j + 1])
             # This batch is done; the characters already processed can be dropped before the
             # next batch, except the last one, which becomes the next batch's doc[0], i.e. its left neighbour.
             doc = doc[length - self.max_word - 1:]
     # After the loop, doc still holds a tail shorter than self.batch_size that needs handling
     length = len(doc)
     # As before, skip doc[0]
     for i in xrange(1, length):
         for j in xrange(i + 1, min(i + self.max_word + 1, length + 1)):
             text = doc[i: j]
             if text not in candidates:
                 candidates[text] = Word(text)
             candidates[text].meet(doc[i - 1: i], doc[j: j + 1])
     # Compute frequencies and left/right neighbour entropies
     print "making statistics ..."
     # hat_trie does not have an iteritems() function
     # for text in candidates.iterkeys():
     #     candidates[text].statistics(self.doc_length)
     for _, word in candidates.iteritems():
         word.statistics(self.doc_length)
     # All frequencies and left/right entropies are now computed, so cohesion (aggregation),
     # inner entropy and the final score can be calculated
     print "calculating aggregations ...."
     # for text in candidates.iterkeys():
     #     if len(text) < 2:
     #        continue
     #     word = candidates[text]
     for text, word in candidates.iteritems():
         if len(text) < 2:
             continue
         word.aggreg = Algorithm.aggregation(word, candidates)
         word.inner = Algorithm.inner_entropy(word, candidates)
         word.score = word.aggreg + min(word.left, word.right) - word.inner
     # From here on single-character entries are no longer needed; the word list below only
     # keeps words of two or more characters
     # self.words = sorted([candidates[text] for text in candidates.iterkeys() if len(text) > 1], key=lambda v: v.freq, reverse=True)
     self.words = sorted([word for text, word in candidates.iteritems() if len(text) > 1], key=lambda v: v.freq, reverse=True)
     # Some summary statistics
     total = float(len(self.words))
     print "Avg len: ", sum([len(w.text) for w in self.words]) / total
     print "Avg freq: ", sum([w.freq for w in self.words]) / total
     print "Avg left ent: ", sum([w.left for w in self.words]) / total
     print "Avg right ent: ", sum([w.right for w in self.words]) / total
     print "Avg aggreg: ", sum([w.aggreg for w in self.words]) / total
     print "Avg inner ent: ", sum([w.inner for w in self.words]) / total
     print "Avg score: ", sum([w.score for w in self.words]) / total
     # Save the current results
     with codecs.open("candidates_statistics.csv", "w", "utf-8") as f:
         for w in self.words:
             f.write(u"{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(w.text, w.freq, w.left, w.right, w.aggreg, w.inner, w.score))
Example #30
# -*- coding: utf-8 -*-
import re
import django
import codecs
import json
import unicodecsv
import pygtrie
from collections import defaultdict
django.setup()
from sefaria.model import *

maxrabbilen = 0
with open("RabbisNames.csv", 'rb') as fin:
    tonorabbiscsv = unicodecsv.DictReader(fin)
    tonorabbis = pygtrie.Trie()
    for row in tonorabbiscsv:
        rabbiName = u""
        for i in range(1, 11):
            tempName = row[u"Name{}".format(i)]
            if not tempName:
                break
            if i > 1:
                rabbiName += u" "
            rabbiName += tempName
        rabbiName = rabbiName.replace(u"ר'", u"רבי")
        if len(rabbiName) > maxrabbilen:
            maxrabbilen = len(rabbiName)
        tonorabbis[rabbiName] = 0

    pass