def gen_distance_vec(tweet, trigger_word):
    trig = tk.tokenize(trigger_word)
    words = tk.tokenize(tweet)
    pos = -1
    for i in range(len(words)):
        if trig[len(trig) // 2] == words[i]:
            pos = i
            break
    first_p, last_p = -1, -1
    for i in range(len(words)):
        if trig[0] == words[i]:
            first_p = i
            break
    if first_p == -1:
        for i in range(len(words)):
            if trig[0] in words[i] or words[i] in trig[0]:
                first_p = i
                break
    for i in range(len(words) - 1, -1, -1):
        if trig[-1] == words[i]:
            last_p = i
            break
    if last_p == -1:
        for i in range(len(words)):
            if trig[-1] in words[i] or words[i] in trig[-1]:
                last_p = i
                break
    return [i - pos for i in range(len(words))], [first_p, last_p]
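# --- Hedged usage sketch (not from the original source): gen_distance_vec relies
# on a module-level `tk` tokenizer; a plain whitespace splitter stands in for it
# here so the example is self-contained and runnable.
class _WhitespaceTk:
    @staticmethod
    def tokenize(text):
        return text.split()

tk = _WhitespaceTk()

# The middle trigger token "out" is found at word index 2, so each word gets its
# offset from that position; the trigger itself spans word indices 1..2.
distances, span = gen_distance_vec("fire breaks out in the building", "breaks out")
print(distances)  # [-2, -1, 0, 1, 2, 3]
print(span)       # [1, 2]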
def normalize_data(stem=True):
    global contexts
    for word in words:
        # convert each sense-definition pair to sense-normalized_definition_tokens
        words[word] = map(lambda pair: [pair[0], tokenize(pair[1], stem)], words[word])
    # normalize the contexts in the same way
    contexts = map(lambda triple: [triple[0], triple[1], tokenize(triple[2], stem)], contexts)
def test_paragraph_markers() -> None:
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    # 012345678901234567890123456789012345678901234567
    # ^^^ ^ ^^ ^ ^ ^ ^ ^ ^^
    # x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 20, 22, 24, 30, 38, 39, 41]

    # The tokenize function does stuff to paragraph markers. Test that the
    # indexes are properly calculated after that.
    # Note that the text of the dropped empty paragraph markers disappears.
    s = "[[Stutt setning.]][[]][[Önnur setning.]]"
    # 012345678901234567890123456789012345678901234567
    # ^ ^ ^ ^^ ^ ^ ^ ^^
    # x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 2, 7, 15, 16, 18, 24, 29, 37, 38, 40]
    assert byte_indexes == [0, 2, 7, 15, 16, 18, 24, 30, 38, 39, 41]
def main():
    # take the last argument given on the console
    argument = sys.argv[-1]
    # get the file path or list of files we need; also returns the name we will call our .asm file
    files = find_files(argument)
    # open the file(s) and start writing into nocomments.out
    for i in files:
        # create the nocomments.out file we will use to parse through;
        # each time it loops, it will erase and create a blank
        # nocomments.out file
        open('nocomments.out', 'w+').close()
        # run the .jack file through stripcomments to generate the stripped output
        stripcomments(i)
        # now we have the .jack file with no comments
        # get the name of the file
        name = i.split('/')[-1]
        name = name.partition(".")[0]
        # tokenize to create the <name>T.xml file
        tokenize(name)
def __init__(self, id, url, title, text):
    self.id = id.lower()
    self.url = url.lower()
    self._title = title
    self.title = list(tokenizer.tokenize(title.lower()))
    self._text = text
    self.text = list(tokenizer.tokenize(text.lower()))
def build_vocab(file, threshold, wiki_file=None):
    """Build a simple vocabulary wrapper."""
    captions = pd.read_csv(file, encoding="utf-8").text.values
    counter = Counter()
    for i, caption in enumerate(tqdm(captions)):
        tokens = tokenize(caption)
        counter.update(tokens)
        # if (i + 1) % 1000 == 0:
        #     print("[{}/{}] Tokenized the captions.".format(i + 1, len(captions)))

    if wiki_file is not None:
        with open(wiki_file) as f:
            # Build a progress bar based on the file size
            for line in tqdm(f):
                tokens = tokenize(line)
                counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
def parseFiles_important(html: str):
    '''creates and returns a list of tokens found in headers, bold, and emphasis'''
    soup = BeautifulSoup(html, "lxml")

    '''a list of head tag tokens (this one includes stop words)'''
    heads = soup.find_all(re.compile('head'))
    list_of_heads = tokenizer.tokenize(" ".join(
        [head.get_text() for head in heads]))

    '''a list of heading tag tokens'''
    headers = soup.find_all(re.compile('^h[1-6]$'))
    list_of_headers = tokenizer.tokenize_remove_stopwords(" ".join(
        [header.get_text() for header in headers]))

    '''a list of title tokens (this one includes stop words)'''
    titles = soup.find_all(re.compile('title'))
    list_of_titles = tokenizer.tokenize(" ".join(
        [title.get_text() for title in titles]))

    '''a list of bold'''
    bolds = soup.find_all(re.compile('b'))
    list_of_bolds = tokenizer.tokenize_remove_stopwords(" ".join(
        [bold.get_text() for bold in bolds]))

    '''a list of strong'''
    strongs = soup.find_all(re.compile('strong'))
    list_of_strongs = tokenizer.tokenize_remove_stopwords(" ".join(
        [strong.get_text() for strong in strongs]))

    '''a list of emphasis tags'''
    emphasis = soup.find_all(re.compile('em'))
    list_of_emphasis = tokenizer.tokenize_remove_stopwords(" ".join(
        [empha.get_text() for empha in emphasis]))

    return (list_of_heads + list_of_headers + list_of_titles +
            list_of_bolds + list_of_strongs + list_of_emphasis)
def calculate_wmd_scores(references, candidates, wmd_model):
    '''
    Calculate Word Mover's Distance for each (reference, candidate)
    pair in a list of reference texts and candidate texts.

    The lower the distance, the more similar the texts are.

    Parameters
    ----------
    references : list
        Input texts
    candidates : list
        Output texts (e.g. from a style transfer model)
    wmd_model : gensim.models.word2vec.Word2Vec
        Trained Word2Vec model

    Returns
    -------
    wmd_scores : list
        WMD scores for all pairs
    '''
    wmd_scores = []
    for i in range(len(references)):
        wmd = wmd_model.wv.wmdistance(tokenize(references[i]), tokenize(candidates[i]))
        wmd_scores.append(wmd)
    return wmd_scores
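# --- Hedged usage sketch (not from the original source): the `tokenize` helper
# used by calculate_wmd_scores is assumed to be a simple lowercasing splitter,
# and the Word2Vec model below is a toy model (gensim 4.x API) trained on a few
# sentences purely for illustration. Note that wmdistance() needs gensim's
# optional POT/pyemd dependency to be installed.
from gensim.models import Word2Vec

def tokenize(text):
    return text.lower().split()

_toy_corpus = [tokenize(s) for s in [
    "the movie was great",
    "the film was wonderful",
    "the weather is bad today",
]]
_toy_model = Word2Vec(_toy_corpus, vector_size=10, min_count=1, seed=1)

print(calculate_wmd_scores(["the movie was great"],
                           ["the film was wonderful"],
                           _toy_model))  # one WMD value; lower means more similar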
def test_till_calculate_plagiarism_score():
    origin_text = 'the big cat is sleeping'
    susp_text = 'the cat is big'

    origin_tokens = tokenize(origin_text)
    susp_tokens = tokenize(susp_text)

    print(f'Raw text: {origin_text}')
    print(f'Tokenized text: {origin_tokens}\n\n')

    lcs_length = main.find_lcs_length(origin_tokens, susp_tokens, plagiarism_threshold=0.0)
    print('The length of the longest common subsequence for \n\n'
          f'{origin_text} \n\nand \n\n{susp_text}: \n\n{lcs_length} \n')

    matrix = main.fill_lcs_matrix(origin_tokens, susp_tokens)
    print('A matrix:')
    print(*matrix, sep='\n', end='\n\n')

    longest_lcs = main.find_lcs(origin_tokens, susp_tokens, matrix)
    print(f'The longest common subsequence: {longest_lcs}')

    score = main.calculate_plagiarism_score(lcs_length, susp_tokens)
    print(f'The plagiarism score: {score:.2f}\n')

    return score
def test_composite_phrases() -> None:
    s = "Orða- og tengingasetning."
    # 0123456789012345678901234
    # ^ ^^ ^ ^
    # x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 4, 5, 8, 24]
    assert byte_indexes == [0, 5, 6, 9, 25]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 4, 5, 8, 24, 25]
    assert byte_indexes == [0, 5, 6, 9, 25, 26]

    # The whole thing gets squished together into a single token.
    s = "Orða- og tengingasetning."
    # 0123456789012345678901234
    # ^ ^
    # x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 24]
    assert byte_indexes == [0, 25]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 24, 25]
    assert byte_indexes == [0, 25, 26]
def _clean(self, source, target, source_cleaned, target_cleaned, m):
    self.info('Cleaning...')
    source_in = codecs.open(source, 'rb', 'utf-8')
    target_in = codecs.open(target, 'rb', 'utf-8')
    source_out = codecs.open(source_cleaned, 'wb', 'utf-8')
    target_out = codecs.open(target_cleaned, 'wb', 'utf-8')
    for num_lines, _ in enumerate(source_in):
        pass
    source_in.seek(0, 0)
    pbar = ProgressBar(maxval=num_lines).start()
    for l in count(0):
        source_line = source_in.readline()
        target_line = target_in.readline()
        if not source_line or not target_line:
            break
        source_tokens = tokenize(source_line)
        target_tokens = tokenize(target_line)
        if len(source_tokens) == 0 or len(source_tokens) > m \
                or len(target_tokens) == 0 or len(target_tokens) > m:
            continue
        source_out.write(' '.join(source_tokens) + '\n')
        target_out.write(' '.join(target_tokens) + '\n')
        pbar.update(l)
    pbar.finish()
    source_in.close()
    target_in.close()
    source_out.close()
    target_out.close()
def common_words(tweet1, tweet2):
    tweet_1 = tk.tokenize(tweet1)
    tweet_2 = tk.tokenize(tweet2)
    counts = 0
    for i in tweet_1:
        if i in tweet_2:
            counts += 1
    return counts
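# --- Quick hedged check for common_words, reusing the whitespace stand-in for
# `tk` defined in the sketch after gen_distance_vec above.
print(common_words("the cat sat here", "the cat ran"))  # 2 -> "the" and "cat"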
def test_slash_within_paren_works(self):
    tks = tk.tokenize('(my/example)')
    self.assertEqual(len(tks), 1)
    self.assertEqual(tks[0][0], ChunkType.Paren)
    tks = tk.tokenize('(my/example)', parse_slash=True)
    self.assertEqual(len(tks), 1, "expected one element, got: " + repr(tks))
    self.assertEqual(tks[0][0], ChunkType.Paren)
def get_msr_feats(corpus):
    feats1 = []
    feats2 = []
    for sample in corpus:
        words1 = [word.lower() for word in tokenize(sample[1])]
        words2 = [word.lower() for word in tokenize(sample[2])]
        feats1.append([words1])
        feats2.append([words2])
    return feats1, feats2
def test_enclosing_chars_have_precedence_over_delimiters(self):
    tks = tk.tokenize('(a,b;c|d/e) {a,b;c|d/e} [a,b;c|d/e]')
    self.assertEqual(len(tks), 3)
    for chunk in tks:
        self.assertEqual(chunk[1], 'a,b;c|d/e')
    # test slash
    tks = tk.tokenize('x /a,b;c|d/ y', parse_slash=True)
    self.assertEqual(len(tks), 3)
    self.assertEqual(tks[1][1], 'a,b;c|d')
def test_only_expressions_with_no_spaces_withing_slash_slash_parsed(self):
    print("===")
    tks = tk.tokenize('/AB/', parse_slash=True)
    print("---")
    self.assertEqual(len(tks), 1)
    self.assertEqual(tks[0][0], ChunkType.Slash)
    tks = tk.tokenize('A / B/', parse_slash=True)
    self.assertEqual(len(tks), 1)
    self.assertEqual(tks[0][0], ChunkType.Word)
def get_tokens(obj):
    if isinstance(obj, basestring):
        return tokenize(obj)
    elif isinstance(obj, file):
        return tokenize(obj)
    else:
        # object not valid
        raise TypeError('Got unexpected object type {0!r}'.format(
            obj.__class__.__name__))
def get_kb_numbers(self, kb):
    title = tokenize(re.sub(r'[^\w0-9\.,]', ' ', kb.facts['item']['Title']))
    description = tokenize(re.sub(r'[^\w0-9\.,]', ' ', ' '.join(kb.facts['item']['Description'])))
    numbers = set()
    for token in chain(title, description):
        try:
            numbers.add(float(self.process_string(token)))
        except ValueError:
            continue
    return numbers
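# --- Hedged illustration (not from the original source) of the number-extraction
# idea above, without the class context or the process_string normalization:
# replace everything except word characters, digits, '.' and ',' with spaces,
# split, and keep the tokens that parse as floats.
import re

_text = "Bike, 21-speed, $150.50 or best offer"
_tokens = re.sub(r'[^\w0-9\.,]', ' ', _text).split()
_numbers = set()
for _token in _tokens:
    try:
        _numbers.add(float(_token))
    except ValueError:
        continue
print(_numbers)  # {21.0, 150.5} (set order may vary)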
def test_make_choosable_tokens(self):
    src = "a = 0 /* @ 1, 2, 3 */"
    tokens = tokenize(src)
    choosable_tokens = make_choosable_tokens(tokens)
    self.assertEqual(choosable_tokens, [
        "a", " ", "=", " ", ["1", "2", "3"]])

    src = "a = 0 /* @ 1, 2, 3 */0/* @ 1, 2, 3 */"
    tokens = tokenize(src)
    choosable_tokens = make_choosable_tokens(tokens)
    self.assertEqual(choosable_tokens, [
        "a", " ", "=", " ", ["1", "2", "3"], ["1", "2", "3"]])
def test_generate_every_possible_code(self):
    src = "a = 3 /* @ 1, 2, 3 */"
    possible_codes = generate_every_possible_code(
        make_choosable_tokens(tokenize(src)))
    self.assertEqual(possible_codes, ["a = 1", "a = 2", "a = 3"])

    src = "a = 3 /* @ 1, 2, 3 */0/* @ 1, 2, 3 */"
    possible_codes = generate_every_possible_code(
        make_choosable_tokens(tokenize(src)))
    self.assertEqual(possible_codes, [
        "a = 11", "a = 12", "a = 13",
        "a = 21", "a = 22", "a = 23",
        "a = 31", "a = 32", "a = 33"
    ])
def test_iterator_cases() -> None:
    s = [
        "Þessi ", "setning ", "er ", "í ", "lengra ", "lagi ", "og ", "er ",
        "með ", "bæði ", "eins ", "og ", "tveggja ", "bæta ", "stafi."
    ]
    # (char and byte indexes in a similar test above)
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72]
    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 5, 13, 16, 18, 25, 30, 33, 36, 40, 45, 50, 53, 61, 66, 72, 73]
    assert byte_indexes == [0, 6, 14, 17, 20, 27, 32, 35, 38, 43, 50, 55, 58, 66, 72, 78, 79]

    s = ["Stutt setning.", "", "Önnur setning."]
    # 01234567890123 45678901234567
    # ^ ^ ^ ^ ^ ^
    # x
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 5, 13, 14, 19, 27]
    assert byte_indexes == [0, 5, 13, 14, 20, 28]
    toks = tokenizer.parse_tokens(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 5, 13, 14, 19, 27, 28]
    assert byte_indexes == [0, 5, 13, 14, 20, 28, 29]

    # parse_tokens does some implementation-detail stuff here. Use tokenize instead.
    s = [" Stutt setning. ", "\n \n", "Önnur setning."]
    # 0123456789012345 6 78 90123456789012
    # ^ ^ ^^ ^ ^
    # x
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 6, 14, 15, 24, 32]
    assert byte_indexes == [0, 6, 14, 15, 25, 33]
    toks = tokenizer.tokenize(s)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 6, 14, 15, 24, 32, 33]
    assert byte_indexes == [0, 6, 14, 15, 25, 33, 34]
def iter_sents(ets, total):
    for et in tools.tqdm(ets, total=total):
        name = tokenize(et['name'])
        yield tools.replace_num(name.split())
        desc = et.get('description')
        if desc:
            desc = tokenize(desc)
            yield tools.replace_num(desc.split())
        for syn in et.get('synonyms', []) + et.get('mistypes', []):
            sname = tokenize(syn['name'])
            yield tools.replace_num(sname.split())
def build_canonical_line(vocab, cols):
    if cols[1].strip() == "not_entailment":
        return None
    label = cols[1].strip()
    if label in label_dict:
        label = label_dict[label]
    else:
        label = float(label)
    data = {"uid": cols[0].strip(), "label": label}
    if len(cols) == 3:
        token_id = []
        type_id = []
        seq1s = tokenizer.tokenize(cols[2].lower())
        token_id.append(vocab["<cls>"])
        for seq in seq1s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        type_id.extend([0] * (len(seq1s) + 2))
        data["token_id"] = token_id
        data["type_id"] = type_id
    elif len(cols) == 4:
        token_id = []
        type_id = []
        seq1s = tokenizer.tokenize(cols[2].lower())
        seq2s = tokenizer.tokenize(cols[3].lower())
        token_id.append(vocab["<cls>"])
        for seq in seq1s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        for seq in seq2s:
            if seq in vocab:
                token_id.append(vocab[seq])
            else:
                token_id.append(vocab["<unk>"])
        token_id.append(vocab["<sep>"])
        type_id.extend([0] * (len(seq1s) + 2))
        type_id.extend([1] * (len(seq2s) + 1))
        data["token_id"] = token_id
        data["type_id"] = type_id
    else:
        print(cols)
        return None
    return data
def min_word_count(ex):
    try:
        isl_toks = [
            tok for tok in tokenizer.tokenize(ex["is"])
            if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
        ]
        eng_toks = [
            tok for tok in tokenizer.tokenize(ex["en"])
            if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
        ]
    except TypeError as e:
        return True
    return len(isl_toks) >= DEFAULT_MIN_WORD_COUNT and len(eng_toks) >= DEFAULT_MIN_WORD_COUNT
def process_query(query_words):
    query_type = query_words[:1]
    query_words = query_words[1:]
    if query_type == "1":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer (stem tokens)
        query_words = tokenizer.stem(query_words)
        # take the set of query words to remove duplicates
        docs = conjunctive_matcher(
            list(sorted(set(query_words), key=query_words.index)))
        open("results.txt", "a").write(str(docs) + "\n")
    elif query_type == "2":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer (stem tokens)
        query_words = tokenizer.stem(query_words)
        docs = phrase_matcher(query_words)
        open("results.txt", "a").write(str(docs) + "\n")
    elif query_type == "3":
        # call tokenizer
        query_words = tokenizer.tokenize(query_words.replace("\n", " "))
        # call stemmer (stem tokens)
        query_words = tokenizer.stem(query_words)
        # maps word index to proximity value, e.g. {0: 3} means "/3" follows the 0th word
        proximity_index_dict = {}
        proximity_index = 0
        for word in query_words:
            match = re.match(r"^\/\d+$", word)
            if match:
                proximity_index_dict.update({
                    (proximity_index - 1): int(match.group(0).replace("/", ""))
                })
            else:
                proximity_index += 1
        query_words = [
            word for word in query_words if not re.match(r"^\/\d+$", word)
        ]
        for i in range(0, len(query_words)):
            if i not in proximity_index_dict.keys():
                proximity_index_dict.update({i: 0})
        docs = proximity_matcher(query_words, proximity_index_dict)
        open("results.txt", "a").write(str(docs) + "\n")
    else:
        print("Unsupported query type is given!!!")
def test_converted_measurements() -> None:
    s = "Stillið ofninn á 12° C til að baka kökuna."
    # 012345678901234567890123456789012345678901
    # ^ ^ ^ ^ ^ ^ ^ ^ ^
    # x x x x x
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46]
    toks = tokenizer.tokenize(s, convert_measurements=True)
    char_indexes, byte_indexes = tokenizer.calculate_indexes(toks, last_is_end=True)
    assert char_indexes == [0, 7, 14, 16, 22, 26, 29, 34, 41, 42]
    assert byte_indexes == [0, 8, 15, 18, 25, 29, 33, 38, 46, 47]
def export_data(lines):
    """ Parse "raw" ingredient lines into CRF-ready output """
    output = []
    for line in lines:
        line_clean = re.sub('<[^<]+?>', '', line)
        tokens = tokenizer.tokenize(line_clean)
        # Need a copy, otherwise somehow it gets used up
        tokens_copy = tokenizer.tokenize(line_clean)
        for i, token in enumerate(tokens_copy):
            features = getFeatures(token, i + 1, tokens)
            output.append(joinLine([token] + features))
        output.append('')
    return '\n'.join(output)
def extract_answer(self, q_text, e_texts, word2id, char2id, maxlen=10, threshold=0.1):
    Qc, Qw, Ec, Ew = [], [], [], []
    qc = list(q_text)
    Qc, q_mask = sent2id([qc], char2id)
    qw = alignWord2Char(tokenize(q_text))
    Qw, q_mask_ = sent2id([qw], word2id)
    assert torch.all(q_mask == q_mask_)
    tmp = [(list(e), alignWord2Char(tokenize(e))) for e in e_texts]
    ec, ew = zip(*tmp)
    Ec, e_mask = sent2id(list(ec), char2id)
    Ew, e_mask_ = sent2id(list(ew), word2id)
    assert torch.all(e_mask == e_mask_)
    totensor = lambda x: torch.from_numpy(np.array(x)).long()
    L = [Qc, Qw, q_mask, Ec, Ew, e_mask]
    L = [totensor(x) for x in L]
    As_, Ae_ = self.best_model(L)
    R = {}
    for as_, ae_, e in zip(As_, Ae_, e_texts):
        as_, ae_ = as_[:len(e)].numpy(), ae_[:len(e)].numpy()
        # as_/ae_ are numpy arrays at this point, so use np.where rather than torch.where
        sidx = np.where(as_ > threshold)[0]
        eidx = np.where(ae_ > threshold)[0]
        result = {}
        for i in sidx:
            cond = (eidx >= i) & (eidx < i + maxlen)
            for j in eidx[cond]:
                key = e[i:j + 1]
                result[key] = max(result.get(key, 0), as_[i] * ae_[j])
        if result:
            for k, v in result.items():
                if k not in R:
                    R[k] = []
                R[k].append(v)
    # aggregate and sort all answers
    R = [[k, ((np.array(v) ** 2).sum() / (sum(v) + 1))] for k, v in R.items()]
    R.sort(key=lambda x: x[1], reverse=True)  # (answer, score) pairs in descending order of score
    return R
def min_word_count(ex):
    isl_toks = [
        tok for tok in tokenizer.tokenize(ex["is"])
        if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
    ]
    eng_toks = [
        tok for tok in tokenizer.tokenize(ex["en"])
        if tok.txt is not None and tok.kind == tokenizer.TOK.WORD
    ]
    return (
        len(isl_toks) >= DEFAULT_MIN_WORD_COUNT
        and len(eng_toks) >= DEFAULT_MIN_WORD_COUNT
    )
def test(self, path):
    corp = Corpus(path)
    bs = Bayesian()
    count = 0
    sender_bl = load_pickle('sender_bl.pickle')
    # scan each email and decide whether the msg is SPAM or HAM:
    # first check if the sender occurs in the sender blacklist,
    # then compute the spamicity of the words using the Bayes approach
    for fname, body in corp.emails():
        sender = find_sender(body)
        if sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue
        spamicity_list = []
        count += 1
        tokens = tokenize(body)
        # compute spamicity for each word and create a list of the values
        for el in tokens:
            word_spamicity = [el, bs.word_spamicity(el)]
            spamicity_list.append(word_spamicity)
        # prepare list for Bayes
        spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
        spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
        prediction = bs.bayes_pred(spamicity_list[:15])  # consider only 15 'words'
        if prediction > 0.9 or sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
        else:
            self.tag_it(path, fname, 'OK')
def test_not(self):
    test_str = "(not (and true false))"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def build_index(docs):
    """YOUR CODE HERE
    From the collection of documents, build a data structure that lets you
    identify the documents relevant to a query (e.g., the inverted index
    seen in class).
    """
    # Initialize the index (empty dict)
    index = {}
    print("Build index ... ")
    # Loop over all documents: 1400
    for docID in docs:
        # Get frequencies for document number docID
        freqs = frequencies(tokenize(docs[docID]))
        # For each word in this document
        for word in freqs:
            # If the word is not in our index
            if word not in index.keys():
                # Add a new entry
                index[word] = []
            # In all cases, add a new posting
            index[word].append((docID, freqs[word]))
        if docID % 140 == 0:
            percent = docID / 14
            print(str(percent) + "%")
    print("Build index : Done")
    return index
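# --- Hedged usage sketch (not from the original source): `tokenize` and
# `frequencies` are helpers assumed by build_index; simple stand-ins are
# provided here, and `docs` maps docID -> raw text as the loop above implies.
from collections import Counter

def tokenize(text):
    return text.lower().split()

def frequencies(tokens):
    return dict(Counter(tokens))

docs = {1: "the cat sat", 2: "the dog ran", 3: "a cat and a dog"}
index = build_index(docs)
print(index["cat"])  # [(1, 1), (3, 1)] -> (docID, term frequency) postings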
def test_tokenize_quoted_string(self):
    input = 'name = "value one"'
    expected_output = ['name', '=', 'value one']
    output = tokenizer.tokenize(input)
    for element_position in range(0, len(output)):
        self.assertTrue(
            output[element_position] == expected_output[element_position])
def main(argv=sys.argv):
    argv = argv[1:]
    if len(argv) and argv[0] == "--arabic":
        argv = argv[1:]
        lang = arabic.ArabicModule()
        known_words = read_known_file.read_known_file("arabic_known_words.txt", lang) \
            | read_known_file.read_known_file("arabic_ignore_list.txt", lang)
    elif len(argv) and argv[0] == "--turkish":
        argv = argv[1:]
        lang = turkish.TurkishModule()
        known_words = read_known_file.read_known_file("turkish_known_words.txt", lang) \
            | read_known_file.read_known_file("turkish_ignore_list.txt", lang)
    else:
        lang = french.FrenchModule()
        known_words = read_known_file.read_known_file("french_known_words.txt", lang) \
            | read_known_file.read_known_file("french_ignore_list.txt", lang)
    uw = uniquify.UniqueWords()
    for fname in argv:
        data = filereader.read_file(fname)
        tokens = tokenizer.tokenize(data)
        uw.uniquify(tokens, lang)
    uw.weed_out_uninteresting_words(lang)
    uniquify.rank(uw, lang, known_words)
def extractCoordinates(self):
    self.inputfile = open(self.ifilename, "r")
    line = self.inputfile.readline()
    coords_times_list = []
    i = 0
    while len(line) > 0:
        i = i + 1
        #print i
        try:
            tweet = jsonpickle.decode(line)
        except ValueError, e:
            print repr(e)
            line = self.inputfile.readline()
            continue
        if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
            print "unimplemented data item"
        else:
            #print tweet["text"]
            text = tweet["text"]
            tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
            tokens = tokenizer.tokenize(text)
            if tweet.has_key("coordinates"):
                coord = tweet["coordinates"]
                if coord == None:
                    print "coordinates null"
                elif coord.has_key("type") and coord["type"] == "Point":
                    coords_times_list.append([coord["coordinates"], tweet_w])
                else:
                    print "not a point"
        line = self.inputfile.readline()
def compute_ave_words_in_sentence(self):
    sentences = tokenizer.split_sentence(self.text)
    average = 0
    for sentence in sentences:
        average += len(tokenizer.tokenize(sentence))
    self.ave_words_in_sentence = 1.0 * average / len(sentences)
    return self.ave_words_in_sentence
def main():
    try:
        settings = open('settings.cfg', 'r').read()
    except IOError as e:
        print "Do you have your settings entered correctly?"
        exit(1)
    settings_obj = yaml.load(settings)
    input_path = os.path.abspath(settings_obj['in'])
    output_path = os.path.abspath(settings_obj['out'])
    extensions = settings_obj['extensions']
    main_template = os.path.join(input_path, 'templates/main.html')
    try:
        main_template_html = open(main_template, 'r').read()
    except IOError as e:
        import html_templates
        main_template_html = html_templates.basic
    posts = os.path.join(input_path, 'posts/')
    for post in os.listdir(posts):
        if any([post.endswith(x) for x in extensions]):
            post_path = os.path.join(os.path.dirname(posts), post)
            tokens = tokenizer.tokenize(open(post_path, 'r').read())
            parsed_post = parser.parse(tokens)
            parsed_page = main_template_html.safe_substitute(title=post, body=parsed_post)
            make_html_output(output_path, post, parsed_page)
def query(query, offset, rpp):
    # Load the indexed data
    ids = pickle.load(open(config.data_directory + '/monuments.ids', 'r'))
    dictionary = corpora.Dictionary.load(config.data_directory + '/monuments.dict')
    corpus = corpora.MmCorpus(config.data_directory + '/monuments.mm')
    lsi = models.LsiModel.load(config.data_directory + '/monuments.lsi')
    tfidf = models.TfidfModel.load(config.data_directory + '/monuments.tfidf')
    tfidfIndex = similarities.Similarity.load(config.data_directory + '/monuments.tfidf.index')
    lsiIndex = similarities.Similarity.load(config.data_directory + '/monuments.lsi.index')

    # Convert query to a tokenized document and project it as a vector in tfidf and lsi
    tokenized = tokenizer.tokenize(query)
    vector = dictionary.doc2bow(tokenized)
    tfidf_vector = tfidf[vector]
    lsi_vector = lsi[vector]

    # Determine how similar the query vector is to the other documents in
    # the same spaces (tfidf and lsi), and select the most similar documents
    tfidf_similarity = tfidfIndex[tfidf_vector]
    lsi_similarity = lsiIndex[lsi_vector]
    similarity = np.array(lsi_similarity) * np.array(tfidf_similarity)
    similarity = sorted(enumerate(similarity), key=lambda item: -item[1])
    sims = similarity
    sims = [s for s in sims if s[1] > 0]
    offset = int(min(offset, len(sims)))
    results = [str(ids[sim[0]]) for sim in sims[offset:int(min(offset + rpp, len(sims)))]]

    # Print json result
    print json.dumps({
        'nrOfResults': len(sims),
        'startResult': offset,
        'endResult': min(offset + rpp, len(sims)),
        'results': results})
def getScore(newtitle):
    query = tokenizer.tokenize(newtitle)
    res = idx.queryVector(query, 1)
    #print("{0} results.".format(len(res)))

    # Take average of (upvotes-downvotes) weighted by similarity score ^ 2
    # but only for posts with simscore > max(simscore)/2
    totalweight = 0.0
    totalscore = 0.0
    for n in res:
        simscore = n[1]
        simscore = simscore ** 0.5
        #if simscore < 0.75:
        #    continue
        post = postdata.posts[n[0]]
        #score = post["day"][1] - post["day"][2]  # ups - downs^2
        #score = post["day"][1]
        #score = post["day"][1] + post["num_comments"]
        score = post["day"][1] - post["day"][2] + post["num_comments"] * 2.5
        totalscore += float(score) * simscore
        totalweight += simscore
        #return float(score)  # test
    if totalweight == 0:
        return 0.0  # couldn't make a score for this
    finalscore = (totalscore / totalweight)
    return finalscore
def guess(self, text):
    doc_counts = {}
    doc_inverse_counts = {}
    tokens = tokenize(text)
    scores = {}
    for label in self.labels:
        doc_counts[label] = self.doc_count(label)
        doc_inverse_counts[label] = self.doc_inverse_count(label)
    total = self.total_doc_count()
    for label in self.labels:
        logSum = 0.0
        for word in tokens:
            stem_total_count = self.stem_total_count(word)
            if stem_total_count == 0.0:
                continue
            else:
                word_prob = self.stem_label_count(label, word) / doc_counts[label]
                word_inverse_prob = self.stem_inverse_label_count(label, word) / doc_inverse_counts[label]
                wordicity = word_prob / (word_prob + word_inverse_prob)
                wordicity = ((1.0 * 0.5) + (stem_total_count * wordicity)) / (1.0 + stem_total_count)
                if wordicity == 0.0:
                    wordicity = 0.01
                elif wordicity == 1:
                    wordicity = 0.99
            try:
                logSum += math.log(1.0 - wordicity) - math.log(wordicity)
            except ValueError:
                print "ValueError"
        try:
            scores[label] = 1.0 / (1.0 + math.exp(logSum))
        except OverflowError:
            print "OverflowError"
    return scores
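# --- Worked illustration (not part of the classifier above) of the smoothing
# applied to `wordicity`: with a single observed occurrence of a stem and a raw
# probability of 1.0, the estimate is pulled toward the 0.5 prior.
_stem_total_count = 1.0
_raw_wordicity = 1.0
_smoothed = ((1.0 * 0.5) + (_stem_total_count * _raw_wordicity)) / (1.0 + _stem_total_count)
print(_smoothed)  # 0.75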
def test_while(self):
    test_str = "(while (< 3 i) (assign i (+ i 1)))"
    expected_tokens = [Token("(", "L_PAREN"), Token("while", "E_WHILE"),
                       Token("(", "L_PAREN"), Token("<", "O_LT"),
                       Token("3", "V_INT"), Token("i", "V_STRING"),
                       Token(")", "R_PAREN"), Token("(", "L_PAREN"),
                       Token("assign", "E_ASSIGN"), Token("i", "V_STRING"),
                       Token("(", "L_PAREN"), Token("+", "O_ADD"),
                       Token("i", "V_STRING"), Token("1", "V_INT"),
                       Token(")", "R_PAREN"), Token(")", "R_PAREN"),
                       Token(")", "R_PAREN")]
    actual_tokens = tokenize(test_str)
    for actual, expected in izip(actual_tokens, expected_tokens):
        self.assertEqual(actual, expected)
    consumed, remaining = S(actual_tokens)
    self.assertEqual(remaining, [])
    if not remaining:
        print "accepted"
    print consumed
def rank_docs(index, query):
    """YOUR CODE HERE
    Return the series of docIDs ordered by their relevance to the
    query 'query'.
    """
    # Initialize the score table
    ranking = {}
    for i in range(1, 1401):
        ranking[i] = 0
    # For each word in the query
    for word in tokenize(query):
        # If we have this word in our index
        if word in index.keys():
            # For each document in which we can find the word
            for item in index[word]:
                # Increase the score
                ranking[item[0]] += item[1]
    # Sort the documents by score
    sorted_ranking = sorted(ranking.items(), key=operator.itemgetter(1), reverse=True)
    ranking = []
    for couple in sorted_ranking:
        ranking.append(couple[0])
    return ranking
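# --- Hedged follow-on to the build_index sketch above: ranking a toy query
# against the same three-document index. rank_docs assumes a module-level
# `operator` import and the same `tokenize` helper.
import operator

print(rank_docs(index, "cat dog")[:3])  # [3, 1, 2] -> doc 3 matches both terms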
def build_lattice(self, pt, sentence):
    '''
    Gets a phrase table and the tokenized sentence and outputs a lattice
    file formatted as follows:

    whole sentence
    1-1:
    <English translation> <Translation score>
    <English translation> <Translation score>
    ...
    1-2:
    <English translation> <Translation score>
    <English translation> <Translation score>
    ...
    2-2:

    The spans n-n refer to the tokens of the input Spanish sentence
    '''
    sentence = tokenize(sentence)
    self.sentence = sentence
    for start in xrange(len(sentence)):
        self.phrases[start] = {}
        for end in xrange(start + 1, len(sentence) + 1):
            foreign = sentence[start:end]
            p = Phrase(foreign, start, end)
            if len(foreign) == 1 and foreign[0] == ',':
                p.translations = [Translation(foreign, (',',), 0)]
            else:
                p.translations = pt.translate(foreign)
            self.phrases[start][end] = p
def _parse(self, path, content, addWords):
    words = tokenizer.tokenize(path, content)
    wordList = []
    currNode = ParseNode(path, 0, None)
    currLine = [0, currNode]
    nodeId = 1
    for token, start, type in words:
        if type == tokenizer.NOTHING:
            if addWords:
                self.words.add(token)
            wordList.append((token, start, currLine))
        elif type == tokenizer.NEWLINE:
            wordList.append(('\\n', start, currLine))
            prevLine = currLine
            currLine = [currLine[0] + 1, currNode]
        elif type == tokenizer.DEDENT:
            wordList.append(('\\d', start, currLine))
            currNode = currNode.parent
            currLine[1] = currNode
        elif type == tokenizer.INDENT:
            wordList.append(('\\i', start, currLine))
            currNode = ParseNode(path, nodeId, currNode)
            nodeId += 1
            prevLine[1] = currNode
            currLine[1] = currNode
    if len(wordList) == 0:
        wordList.append(('\\n', 0, currLine))
    return wordList
def processFile(self):
    self.inputfile = open(self.ifilename, "r")
    line = self.inputfile.readline()
    i = 0
    while len(line) > 0:
        i = i + 1
        #print i
        try:
            tweet = jsonpickle.decode(line)
        except ValueError, e:
            print repr(e)
            line = self.inputfile.readline()
            continue
        if tweet.has_key("delete") or tweet.has_key("scrub_geo") or tweet.has_key("limit"):
            print "unimplemented data item"
        else:
            #print tweet["text"]
            text = tweet["text"]
            tweet_w = time.strptime(tweet["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
            tokens = tokenizer.tokenize(text)
            #print tokens
            #print tokens[0]
            self.countTokens(tokens)
            self.recordTokens(tokens, tweet_w)
        line = self.inputfile.readline()
def indexDir(dirname):
    basename = os.path.basename(dirname.rstrip("/"))
    indexFile = open('./indexes/%s_index' % basename, 'w')
    idMap = {}
    indexDict = {}
    docId = 0
    for (root, dirnames, filenames) in os.walk(dirname):
        for filename in filenames:
            if (re.search(r"\.sw[op]$", filename) == None):
                with open(os.path.join(root, filename), 'r') as fh:
                    idMap[docId] = filename
                    tokens = tokenize(fh)
                    for (pos, token) in tokens:
                        token = stem(alias(token))
                        try:
                            positionMap = indexDict[token]
                            try:
                                positionMap[docId].append(pos)
                            except KeyError:
                                positionMap[docId] = [pos]
                        except KeyError:
                            indexDict[token] = {docId: [pos]}
                    docId += 1
    fullIndex = {"id_map": idMap, "index": indexDict}
    indexFile.write(json.dumps(fullIndex))
    indexFile.close()
def _phrase_search(user, query):
    n = normalize(query)
    keywords = tokenize(n)
    logging.info('phrase_search: query: ' + query)
    logging.info('n: ' + n)
    logging.info('keywords:' + str(keywords))
    if not len(keywords):
        return []
    logging.info('%d - %s' % (0, keywords[0]))
    results = _lookup(user, keywords[0])
    if not results:
        return []
    logging.info('%s' % str(results))
    for i in range(1, len(keywords)):
        logging.info('%d - %s' % (i, keywords[i]))
        id_pos_dict = _lookup(user, keywords[i])
        logging.info('%s' % str(id_pos_dict))
        if id_pos_dict:
            # iterate over a copy of the keys so entries can be deleted safely
            for id in list(results.keys()):
                if id not in id_pos_dict:
                    del results[id]
                else:
                    poses = []
                    for pos in id_pos_dict[id]:
                        if pos - 1 in results[id]:
                            poses.append(pos)
                    if not len(poses):
                        del results[id]
                    else:
                        results[id] = poses
        else:
            return []
    return results.keys()
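# --- Worked illustration (not from the original source) of the positional
# intersection step above: a document survives for keyword i only if one of its
# positions is exactly one past a surviving position of keyword i-1.
_results = {7: [3, 10]}       # doc 7: previous keyword at positions 3 and 10
_id_pos_dict = {7: [4, 12]}   # doc 7: current keyword at positions 4 and 12
_kept = [p for p in _id_pos_dict[7] if p - 1 in _results[7]]
print(_kept)  # [4] -> only the adjacent pair (3, 4) forms the phrase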
def parse_paragraph(parag, mim_tags, fast_p):
    """ Parse a single paragraph in free text form and compare to MIM POS tags """
    tokens = tokenize(parag)
    tlist = list(tokens)
    result = parse_tokens(tlist, mim_tags, fast_p)
    print("{0}\n--> {1} sentences, {2} parsed".format(parag, result["num_sent"], result["num_parsed_sent"]))
def test_simple_file(self):
    input = """#include GLFW_INCLUDE_GLU
#include <GLFW/glfw3.h>
#include <cstdio>

/* Random function */
static void glfw_key_callback(int key, int scancode, int action, int mod){
  if(glfw_key_callback){
    // Comment here
    input_event_queue->push(inputaction);
  }
}"""
    (final_stats, final_tokens, file_times) = tokenizer.tokenize(input, comment_inline_pattern, comment_open_close_pattern, separators)
    (file_hash, lines, LOC, SLOC) = final_stats
    (tokens_count_total, tokens_count_unique, token_hash, tokens) = final_tokens
    self.assertEqual(lines, 11)
    self.assertEqual(LOC, 10)
    self.assertEqual(SLOC, 8)
    self.assertEqual(tokens_count_total, 24)
    self.assertEqual(tokens_count_unique, 18)
    self.assert_common_properties(tokens)
    hard_tokens = set(['int@@::@@4', 'void@@::@@1', 'cstdio@@::@@1', 'action@@::@@1', 'static@@::@@1', 'key@@::@@1', 'glfw_key_callback@@::@@1', 'mod@@::@@1', 'if@@::@@1', 'glfw3@@::@@1', 'scancode@@::@@1', 'h@@::@@1', 'GLFW_INCLUDE_GLU@@::@@1', 'input_event_queue@@::@@2', 'GLFW@@::@@1', 'push@@::@@1', 'inputaction@@::@@1', 'include@@::@@3'])
    this_tokens = set(tokens[3:].split(','))
    self.assertTrue(len(hard_tokens - this_tokens), 0)
    m = hashlib.md5()
    m.update(tokens[3:])
    self.assertEqual(m.hexdigest(), token_hash)
def __init__(self, source):
    self.numTemps = 0
    self.macros = []
    self.mlMacros = []
    for mem in dir(self):
        mem = getattr(self, mem)
        if isinstance(mem, type) and issubclass(mem, Macro):
            if issubclass(mem, MLMacro):
                self.mlMacros.append(mem(self))
            else:
                self.macros.append(mem(self))
    self.macros.sort()
    self.mlMacros.sort()
    tokens = tokenizer.tokenize(source)
    pprint.pprint(tokens)
    code = self.compile(tokens)
    pprint.pprint(code)
    code = Module(
        None,
        Stmt(code)
    )
    set_filename('<macropy>', code)
    self.compiled = ModuleCodeGenerator(code).getCode()
def test_math2(self):
    test_str = "(* 34 (- 23 45))"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def test_sin(self):
    test_str = "(sin 3.2)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def test_lt_gt(self):
    test_str = "(and (< 3 5) true)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def compute(topic):
    raw, ref = get_article(topic)
    sent = tokenize(raw)
    df = pd.DataFrame()
    ratio = len(ref) / len(raw)

    # TextRank
    result = text_rank(raw, sent, ref)
    r = Rouge()
    rouge = r.get_scores(result, ref)
    df = df.append(gen_serie('TextRank', rouge, result), ignore_index=True)

    # Gensim
    ret = summarize(raw, ratio)
    r = Rouge()
    rouge = r.get_scores(ret, ref)
    df = df.append(gen_serie('Gensim', rouge, ret), ignore_index=True)

    # KMean
    df = df.append(kmean(sent, ret))

    # Cosine
    df = df.append(cosine(sent, ref), ignore_index=True)

    # Rearrange columns
    df = df[columns]
    df.to_csv('out/' + topic + '.csv')
    return df.to_json(orient='records')
def test_bool_value(self):
    test_str = "(iff (and (or true false) true) true)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def test_tokenize_multi_propety(self):
    input = "name = {\n\tvalue1 value2\n}"
    expected_output = ['name', '=', '{', 'value1', 'value2', '}']
    output = tokenizer.tokenize(input)
    for element_position in range(0, len(output)):
        self.assertTrue(
            output[element_position] == expected_output[element_position])
def test_math1(self):
    test_str = "(+ (- 234.3 1.1) 23)"
    actual_tokens = tokenize(test_str)
    consumed, remaining = S(actual_tokens)
    code = generate_code(consumed)
    print test_str
    print code
def test_tokenize_single_property(self):
    input = "name = value"
    expected_output = ['name', '=', 'value']
    output = tokenizer.tokenize(input)
    for element_position in range(0, len(output)):
        self.assertTrue(
            output[element_position] == expected_output[element_position])
def generate_model(input_filename, output_filename=None):
    if output_filename is None:
        input_file, input_ext = os.path.splitext(input_filename)
        output_filename = input_file + '.dat'

    model = {
        'id2word': [],
        'word2id': {},
        'wordgrams': [],
        'normalizedgrams': [],
        'words_count': 0,
        'normalized_count': 0,
        'words_sum': 0,
        'coefficients': {
            'wordgrams': [1] * WORDGRAMS_SIZE,
            'normalizedgrams': [1] * NORMALIZEDGRAMS_SIZE,
        },
        'unknown': 0.0,
    }

    # READ TOKENS
    with open(input_filename, 'r') as _file:
        tokens = tokenizer.tokenize(_file.read())

    print('##', input_filename)

    make_dictionary(tokens, model)
    gather_wordgrams(tokens, model)
    gather_normalizedgrams(tokens, model)
    calculate_unknown(tokens, model)
    calculate_coefficients(model)

    save_model(model, output_filename)