def build_test_data():
    dawg.CompletionDAWG(['f', 'bar', 'foo', 'foobar']).save(
        'dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (('foo', b'data1'), ('bar', b'data2'),
                  ('foo', b'data3'), ('foobar', b'data4'))
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG(str("=H"), [
        (k, (len(k),)) for k in TestPrediction.DATA
    ]).save('dev_data/small/prediction-record.dawg')

    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
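# Note: a minimal read-back sketch (not part of the original source) showing
# how the fixtures above might be loaded; the format string passed to
# RecordDAWG must match the one used when the file was saved.
def load_test_data():
    completion = dawg.CompletionDAWG().load('dev_data/small/completion.dawg')
    record = dawg.RecordDAWG(">3H").load('dev_data/small/record.dawg')
    print(completion.keys('foo'))  # all stored keys starting with 'foo'
    print(record['foo'])           # every (H, H, H) tuple stored under 'foo'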
def createIndex(self):
    '''
    Truncated Deletion Neighborhoods.

    Like the full deletion neighborhood technique above, except each
    dictionary entry is truncated to k characters, so every entry in the
    index represents a range of dictionary words sharing that prefix.

    k = 7 and delta = 3 gives an index roughly the same size as the
    dictionary with running time identical to the full-size deletion
    neighborhood. k = 6 and delta = 3 gives an index smaller than the
    dictionary at the cost of approximately double the runtime.

    Investigate: using different k values for different words.

    Changes from the full-word algorithm: build a trie instead. To search
    for substrings of length i, we do a traversal from the last trie node
    (the trie tool handles this automatically).
    '''
    prefixes = self.getRanges()  # takes the form {prefix: rangeid}
    # Each range is an array of length 2, [index, size].
    # Turn the arrays into tuples and use them as the trie values.
    # First, get the deletion neighborhood for every key and arrange the
    # results into two parallel arrays for feeding into the trie.
    keys = []
    values = []
    # Little-endian pair of unsigned shorts: max number and size of
    # ranges is 65535. If that's not enough, use 'I' (unsigned int).
    fmt = "<HH"
    for prefix in prefixes.keys():
        # get the deletion neighborhood of the (truncated) prefix
        substringset = n_deletion_neighborhood(prefix, self.delta)
        for sub in substringset:
            # Lemma 3.2.5: in order to index we only need sequences of
            # length k - delta.
            # TODO: decide whether to make this check != or >.
            # > should result in a bigger index but somehow doesn't?
            # I'm confused.
            # != causes loss of recall for words shorter than k - delta.
            if len(sub) > self.k - self.delta or len(sub) < self.minimum_prefix:
                continue
            keys.append(sub)
            values.append(tuple(self.ranges[prefixes[prefix]]))
    try:
        self.index = dawg.RecordDAWG(fmt, zip(keys, values))
    except Exception:
        print("Big dictionary, trying to index with 4 bytes rather than 2")
        self.index = dawg.RecordDAWG("<II", zip(keys, values))
    del prefixes
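# Note: n_deletion_neighborhood() is not defined in this snippet. A plausible
# sketch (hypothetical reconstruction, assuming it returns every string
# reachable from `word` by deleting at most `n` characters, including `word`):
def n_deletion_neighborhood(word, n):
    neighborhood = {word}
    frontier = {word}
    for _ in range(n):
        # one more deletion applied to every string from the previous round
        frontier = {s[:i] + s[i + 1:] for s in frontier for i in range(len(s))}
        neighborhood |= frontier
    return neighborhood

# e.g. n_deletion_neighborhood("abc", 1) == {"abc", "ab", "ac", "bc"}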
def __sents__(self):
    encoding = self._encoding
    sentencizer = self._prep.sentencizer
    clean = self._clean
    path = self._input

    if self.loadas == 'txt' and self._path:
        path = datapath(self._path, datadir=self.datadir,
                        ext=".tagged.txt").full
        if os.path.exists(path):
            encoding = 'utf-8'
            sentencizer = None
            clean = None

    stream = Stream(path, encoding=encoding)
    self.encoding = stream._encoding

    for num, sent in enumerate(stream(sentencizer, clean)):
        tagged_sent = TaggedSentence(sent.strip(), num,
                                     self._prep, self._filters)
        lemmas = tagged_sent.lemmas()
        # all lemmas end up in this dictionary,
        # since nothing is filtered at this point
        self._vocab += FreqDist(lemmas)
        self._nwords += tagged_sent.nwords
        self._sents.append(tagged_sent)
        #self._words.extend(tagged_sent.words())
        yield tagged_sent

    data = ((token.word, (token.nsent, token.idx))
            for sent in self.sents()
            for token in sent.tokens(lower=True))
    self._trie = dawg.RecordDAWG(">IH", data)
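# Note: the ">IH" format above packs each occurrence as a big-endian pair:
# an unsigned int (sentence number) and an unsigned short (token position).
# A standalone illustration with made-up data:
import dawg

positions = [('cat', (0, 2)), ('cat', (5, 0)), ('dog', (1, 3))]
trie = dawg.RecordDAWG(">IH", positions)
print(trie['cat'])  # [(0, 2), (5, 0)] -- every recorded occurrence of 'cat'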
def _build_dawg(self) -> dawg.RecordDAWG:
    words = self._filter_words(self._words)
    freqs = set(freq for _, (_, _, freq) in words)
    freq_to_index = self._quantize_freqs(freqs)
    words = ((word, (freq_to_index[freq], lem_rule, gr_val))
             for word, (gr_val, lem_rule, freq) in words)
    return dawg.RecordDAWG('>HHH', words)
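# Note: _quantize_freqs() is not shown. It must map every distinct frequency
# to an integer small enough for the 'H' (unsigned short) slot; a plausible
# sketch (hypothetical reconstruction):
def _quantize_freqs(self, freqs):
    # ascending index per distinct frequency; fits in an unsigned short
    # as long as there are at most 65536 distinct values
    return {freq: i for i, freq in enumerate(sorted(freqs))}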
def handle(self, *args, **options):
    print 'Emptying table...'
    Term.objects.all().delete()

    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)

        exclude_dist = Distribution.objects.exclude(
            name='Landelijk').values_list('id', flat=True)
        date_range = daterange2dates(dates)

        total_documents = count_search_results(
            settings.ES_INDEX, settings.ES_DOCTYPE, None,
            date_range, exclude_dist, [], []).get('count')
        print 'Total documents: {}'.format(total_documents)

        sets = document_id_chunks(10000, settings.ES_INDEX,
                                  settings.ES_DOCTYPE, None,
                                  date_range, dist=exclude_dist)

        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE,
                                            s, min_length=2, add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(
                n + 1, time.time() - start_time)

        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                idf = math.log10(total_documents / float(count))
                terms.append(
                    Term(timeframe=timeframe, word=term, count=count, idf=idf))

        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)

        print 'Creating RecordDAWG'
        d = dawg.RecordDAWG(
            '<d', zip([t.word for t in terms], [(t.idf,) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
def __init__(self, filename, featname, format=None):
    import dawg
    if format is None:
        self.data = dawg.CompletionDAWG()
    else:
        self.data = dawg.RecordDAWG(format)
    self.data.load(filename)
    self.filename = filename
    super(DAWGGlobalFeature, self).__init__(self.data, featname)
def __init__(self,
             value_format: str = 'h',
             threshold: float = 0.8,
             num_perm: int = 128,
             num_part: int = 32,
             tokenizer: Tokenizer = Tokenizer('zh')):
    self.value_format = value_format
    self.threshold = threshold
    self.num_perm = num_perm
    self.num_part = num_part
    self.tokenizer = tokenizer
    # pass num_part through to the ensemble as well
    self.lsh = MinHashLSHEnsemble(threshold=self.threshold,
                                  num_perm=self.num_perm,
                                  num_part=self.num_part)
    self.record_dawg = dawg.RecordDAWG(self.value_format)
def loaddawg(self, name, path=None):
    path_ = path or datapath(self._path, datadir=self.datadir).short
    path = '{}.{}.dawg'.format(path_, name)
    if self._validpath(path):
        if self.verbose:
            print('loading dawg:'.ljust(16),
                  path.replace(nlptk.MODULEDIR, '..'))
        d = dawg.RecordDAWG(">IH")
        obj = d.load(path)
    else:
        obj = None
    return obj
def __init__(self, dict_path: str):
    with TemporaryDirectory('dict') as temp_dir:
        with ZipFile(dict_path) as zip_file:
            zip_file.extractall(temp_dir)
        self._dawg = dawg.RecordDAWG('>HHH')
        self._dawg.load(os.path.join(temp_dir, 'dict.dawg'))
        with open(os.path.join(temp_dir, 'dict.info'), 'rb') as f:
            (self._categories, self._grammemes_mappings,
             self._grammar_value_mapping, self._lemmatize_rule_mapping,
             self._alphabet, self._similar_letters,
             self._quantized_freqs_mapping) = pickle.load(f)
    self._similar_letters_replacements = self._compile_replacements()
    self._grammemes_matrix = self._build_grammemes_matrix()
def to_dawg(df, columns=None, format=None):
    """
    Encode ``pandas.DataFrame`` with GeoNames data (loaded using
    :func:`read_geonames` and maybe filtered in some way) to
    ``dawg.CompletionDAWG`` or ``dawg.RecordDAWG``.

    ``dawg.CompletionDAWG`` is created if ``columns`` and ``format``
    are both None.
    """
    import dawg
    if columns is None:
        assert format is None
        df = _split_names_into_rows(df)
        return dawg.CompletionDAWG(iter(df.name))
    return dawg.RecordDAWG(format, _iter_geonames_items(df, columns))
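# Note: a hypothetical usage example, assuming the frame loaded by
# read_geonames() carries 'geonameid' and 'population' columns:
df = read_geonames('allCountries.txt')

names_dawg = to_dawg(df)  # plain CompletionDAWG over all names

# RecordDAWG keyed by name, storing (geonameid, population) as two
# big-endian unsigned ints per entry:
records_dawg = to_dawg(df, columns=['geonameid', 'population'], format='>II')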
def add(self, text_list: List[str], value_list: List[tuple]):
    len_text = len(text_list)
    len_value = len(value_list)
    assert len_text == len_value

    data = {}
    entries = []
    for i, text in enumerate(text_list):
        entry = self.text_to_lsh_entry(text)
        key = entry[0]
        if key in data:
            continue
        value = value_list[i]
        self.__check_value_format(value)
        data[key] = value
        entries.append(entry)

    self.lsh.index(entries)
    self.record_dawg = dawg.RecordDAWG(self.value_format, data.items())
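# Note: text_to_lsh_entry() is not shown. With datasketch's MinHashLSHEnsemble,
# each indexed entry is a (key, minhash, size) triple, so plausible sketches of
# the helper and a matching lookup (hypothetical; assumes the tokenizer exposes
# a tokenize() method):
def text_to_lsh_entry(self, text):
    from datasketch import MinHash
    tokens = set(self.tokenizer.tokenize(text))
    minhash = MinHash(num_perm=self.num_perm)
    for token in tokens:
        minhash.update(token.encode('utf-8'))
    return text, minhash, len(tokens)

def search(self, text):
    # candidate keys from the LSH ensemble, values from the RecordDAWG
    _, minhash, size = self.text_to_lsh_entry(text)
    return [(key, self.record_dawg[key])
            for key in self.lsh.query(minhash, size)]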
def normalize_cloud(cloud_data, idf_timeframe=''):
    """
    Normalizes cloud data:
    - if necessary, calculates the tf-idf scores
    - sorts and returns the maximum allowed number of words
    """
    # If an IDF timeframe is set, multiply term frequencies by inverse
    # document frequencies loaded from the corresponding DAWG.
    if idf_timeframe:
        d = dawg.RecordDAWG('<d')
        d.load(os.path.join(settings.PROJECT_PARENT, idf_timeframe + '.dawg'))
        result = [{'term': t, 'count': c, 'tfidf': round(tfidf(d, t, c), 2)}
                  for t, c in cloud_data.items()]
        result = sorted(result, key=lambda k: k['tfidf'], reverse=True)
    else:
        result = [{'term': t, 'count': c} for t, c in cloud_data.items()]
        result = sorted(result, key=lambda k: k['count'], reverse=True)
    return result[:settings.WORDCLOUD_MAX_WORDS]
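# Note: the tfidf() helper is not shown here. Given that the DAWG stores one
# IDF double per term ('<d'), a plausible sketch (hypothetical reconstruction):
def tfidf(idf_dawg, term, count):
    values = idf_dawg.get(term)
    if not values:
        return 0.0  # a term missing from the IDF DAWG contributes no score
    return count * values[0][0]  # term frequency times the stored IDF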
def _filter_words(self, words: Iterable[Tuple[str, Tuple[int, int, float]]]):
    base_dawg = dawg.RecordDAWG('>HHf', words)
    filtered_words = []
    for key in set(base_dawg.keys()):
        values = sorted(base_dawg.get_value(key), key=lambda x: x[0])
        prev_val, prev_lemmatize_rule = None, None
        prev_val_freq = 0.
        for val, lemmatize_rule, freq in values:
            if val == prev_val and lemmatize_rule == prev_lemmatize_rule:
                prev_val_freq += freq
            else:
                if prev_val is not None:
                    filtered_words.append(
                        (key, (prev_val, prev_lemmatize_rule, prev_val_freq)))
                prev_val, prev_lemmatize_rule, prev_val_freq = \
                    val, lemmatize_rule, freq
        if prev_val is not None:
            filtered_words.append(
                (key, (prev_val, prev_lemmatize_rule, prev_val_freq)))
    return filtered_words
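# Note: an illustrative example (made-up values) of what the merge above does.
# Duplicate (grammar value, lemmatize rule) pairs under one key have their
# frequencies summed; distinct readings are kept separately:
#
#   words = [('стали', (10, 4, 0.25)),   # noun reading
#            ('стали', (10, 4, 0.25)),   # same reading seen again
#            ('стали', (22, 7, 0.5))]    # verb reading
#   self._filter_words(words)
#   # -> [('стали', (10, 4, 0.5)), ('стали', (22, 7, 0.5))]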
def test_record_dawg_items_values(self, word, prediction):
    d = dawg.RecordDAWG(str("=H"), self.LENGTH_DATA)
    assert d.similar_item_values(word, self.REPLACES) == prediction
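# Note: REPLACES above is a compiled replacement table; in the dawg library it
# is produced with dawg.DAWG.compile_replaces. A standalone illustration:
import dawg

REPLACES = dawg.DAWG.compile_replaces({'Е': 'Ё'})  # 'Е' may stand in for 'Ё'

d = dawg.RecordDAWG("=H", [('ЁЖ', (2,))])
# one matching key, whose value list holds a single (length,) record:
assert d.similar_item_values('ЕЖ', REPLACES) == [[(2,)]]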
def __init__(
        self,
        intake,
        prep: Prep = None,
        clean: TextCleaner = None,
        filters: TokenFilter = None,
        inplace=False,
        datadir=None,
        encoding=chardetector,
        verbose=True,
        rewrite=False,
        loadas="pickle",
        saveas=("txt", "pickle"),
        input='filename'  # str {'filename', 'file', 'text'}
):
    self._path = ''
    self.filename = ''
    self.name = ''
    self.inplace = inplace
    self.verbose = verbose
    self.rewrite = rewrite
    self.loadas = loadas
    self.saveas = saveas
    self.encoding = 'unknown'

    if not datadir:
        self.datadir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), "data")
    else:
        self.datadir = os.path.abspath(datadir)

    self._encoding = None
    self._nwords = 0
    self._sents = []
    self._vocab = FreqDist()
    self._trie = dawg.RecordDAWG(">IH")

    if input == 'filename':
        self._path = intake
        #self.filename = os.path.basename(os.path.splitext(self._path)[0])
        self.filename = os.path.basename(self._path)
        self.name = os.path.splitext(self.filename)[0]
        self._encoding = encoding
        self._input = intake

        if not self.rewrite:
            if self.loadas == 'pickle':
                # all sentences from the text
                self._sents = self.loadpickle('sents') or []
                # all unique normalized words from the text
                self._vocab = self.loadpickle('vocab') or FreqDist()
                # iterative loading of the vocabulary takes several seconds,
                # so it is faster (fractions of a second) to read it
                # from pickle
                #for sent in self._sents:
                #    self._vocab += FreqDist(sent.lemmas())
            self._trie = self.loaddawg('trie') or dawg.RecordDAWG(
                ">IH")  # prefix tree

    elif input == "text":
        self._input = io.StringIO(intake)
        self._path = ''
        self.filename = self._input.__class__.__name__
        self.name = self.filename

    elif input == "file":
        self._input = intake
        self._path = ''
        self.filename = self._input.__class__.__name__
        self.name = self.filename

    if self._sents:
        self._nwords = sum(map(lambda s: s.nwords, self._sents))

    self._prep = prep
    self._clean = clean
    self._filters = filters
    self._iter = self.__sents__()

    # close the generator if data is loaded
    if self._sents:
        self._iter.close()

    if self.inplace:
        if not self._sents:
            list(self._iter)
def create_record_dawg():
    words = words100k()
    values = [[len(word)] for word in words]
    return dawg.RecordDAWG(str('<H'), zip(words, values))
def dawg(self):
    return dawg.RecordDAWG(">3H", self.STRUCTURED_DATA)
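# Note: STRUCTURED_DATA pairs each key with a 3-tuple of unsigned shorts (the
# same shape as record_data in build_test_data above). A minimal standalone
# equivalent:
import dawg

STRUCTURED_DATA = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
d = dawg.RecordDAWG(">3H", STRUCTURED_DATA)

print(d['foo'])        # [(3, 2, 1), (3, 2, 256)] -- sorted by packed bytes
print(d.items('foo'))  # every (key, value) pair whose key starts with 'foo'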