def __init__(self, seed=42):
    super(Solver, self).__init__()
    self.is_train_task = False
    self.morph = pymorphy2.MorphAnalyzer()
    self.toktok = ToktokTokenizer()
    self.seed = seed
    self.init_seed()
def __init__(self, seed=42):
    self.seed = seed
    self.init_seed()
    self.morph = morph
    self.toktok = ToktokTokenizer()
    self.bert = BertEmbedder()
    self.is_loaded = True
def __init__(self):
    self.wl = set()
    for each in brown.words():
        each = each.lower()
        if each.isalpha() and (each not in self.wl):
            self.wl.add(each)
    self.toktok = ToktokTokenizer()
def tokenize(i_file, o_file):
    toktok = ToktokTokenizer()
    with open(i_file, 'r') as i_f, open(o_file, 'w') as o_f:
        for line in tqdm(i_f):
            line = line.rstrip('\n')
            tokens = toktok.tokenize(line)
            print(' '.join(tokens), file=o_f)
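# Minimal usage sketch for the tokenize() helper above; the file names are
# placeholders and nltk/tqdm are assumed to be installed.
from tqdm import tqdm
from nltk.tokenize.toktok import ToktokTokenizer

tokenize('corpus_raw.txt', 'corpus_tokenized.txt')  # one space-joined token line per input line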
def __init__(self, seed=42, data_path='data/'):
    self.is_train_task = False
    self.morph = pymorphy2.MorphAnalyzer()
    self.toktok = ToktokTokenizer()
    self.seed = seed
    self.init_seed()
    #self.synonyms = open(os.path.join(data_path, r'synonyms.txt'), 'r', encoding='utf8').readlines()
    #self.synonyms = [re.sub('\.','', t.lower().strip('\n')).split(' ') for t in self.synonyms]
    #self.synonyms = [[t for t in l if t] for l in self.synonyms]
    self.synonyms = open('./data/synmaster.txt', 'r', encoding='utf8').readlines()
    self.synonyms = [re.sub(r'\.', '', t.lower().strip('\n')).split('|') for t in self.synonyms if len(t) > 5]
    self.antonyms = open('./data/antonyms.txt', 'r', encoding='utf8').readlines()
    self.antonyms = [re.sub(r'\.', '', t.lower().strip('\n')).split('|') for t in self.antonyms if len(t) > 5]
    #self.antonyms = open(os.path.join(data_path, r'antonyms.txt'), 'r', encoding='utf8').readlines()
    #self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
    self.phraseology = []
    self.razgov = []
    self.musor = []
    with open('./data/word.csv', encoding='utf-8') as f:
        for line in f:
            line = line.replace("\n", "").split('|')
            if line[1] == '23':
                self.phraseology.append(line[0])
                #self.musor.append(line[0])
            if line[1] == '25':
                self.razgov.append(line[0])
                #self.musor.append(line[0])
            if line[1] == '99':
                self.musor.append(line[0])
def file_to_features(path, word_vocab, window, min_count, total_w):
    examples = []
    toktok = ToktokTokenizer()
    punckt = set(string.punctuation)
    try:
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                for sentence in sent_tokenize(line):
                    words_1 = toktok.tokenize(sentence)
                    words_2 = []
                    for i, word in enumerate(words_1):
                        word_l = word.lower()
                        if word_l not in word_vocab:
                            continue
                        if word_vocab[word_l] < min_count:
                            continue
                        if word in punckt:
                            continue
                        # Word2vec-style subsampling: drop frequent words with
                        # probability 1 - sqrt(t / frequency), t = 1e-4.
                        frequency = word_vocab[word_l] / total_w
                        number = 1 - math.sqrt(10e-5 / frequency)
                        if random.uniform(0, 1) <= number:
                            continue
                        words_2.append(word)
                    # Emit (center, context) pairs within the window.
                    max_j = len(words_2)
                    for i, word in enumerate(words_2):
                        start = i - window if (i - window) > 0 else 0
                        to = i + window if (i + window) < max_j else max_j
                        for j in range(start, to):
                            if i == j:
                                continue
                            target = words_2[j]
                            examples.append((word, target))
    except Exception as error:
        print(error)
    return examples
def __init__(self, log_dir, from_log=False):
    self.log_dir = log_dir
    if checkExistenceFile(os.path.join(log_dir, "PreprocessorConfig.json")):
        self.read_config()
    self.tok = ToktokTokenizer()
    self.parsing_char_ = sha1(b"sally14").hexdigest()
    self.fitted = False
    if from_log:
        self.fitted = True
        with open(
            os.path.join(self.log_dir, "vocabulary.json"),
            "r",
            encoding="utf-8",
        ) as f:
            self.vocabulary_ = json.load(f)
        with open(
            os.path.join(self.log_dir, "WordPhrases.json"),
            "r",
            encoding="utf-8",
        ) as f:
            p = json.load(f)
            self.phrasewords_ = {
                i.replace("_", self.parsing_char_): p[i] for i in p.keys()
            }
def __init__(self, seed=42, data_path='data/'):
    self.is_train_task = False
    self.morph = pymorphy2.MorphAnalyzer()
    self.toktok = ToktokTokenizer()
    self.seed = seed
    self.init_seed()
    self.synonyms = open(os.path.join(data_path, r'synonyms.txt'), 'r', encoding='utf8').readlines()
    self.synonyms = [
        re.sub(r'\.', '', t.lower().strip('\n')).split(' ')
        for t in self.synonyms
    ]
    self.synonyms = [[t for t in l if t] for l in self.synonyms]
    self.antonyms = open(os.path.join(data_path, r'antonyms.txt'), 'r', encoding='utf8').readlines()
    self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
    self.phraseology = open(os.path.join(data_path, r'phraseologs.txt'), 'r', encoding='utf8').readlines()
    self.phraseology = [[
        l for l in self.lemmatize(l)
        if l not in ['\n', ' ', '...', '', ',', '-', '.', '?', r' (', r'/']
    ] for l in self.phraseology]
def build_vocabs(directory_path, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    char_vocab.update(['{', '}'])
    filenames = os.listdir(directory_path)
    filepaths = [os.path.join(directory_path, e) for e in filenames]
    for i, filepath in enumerate(filepaths):
        if i % 100 == 0:
            print('Reading file number {}'.format(i), end="\r")
        with open(filepath, 'r', encoding='utf8') as f:
            try:
                line = f.read()
                if 'numbers_' in filepath:
                    tmp = toktok.tokenize(line.lower())
                    # Count these tokens min_count times so they pass any
                    # later min_count threshold. Use `_` to avoid shadowing
                    # the outer file-index `i`.
                    for _ in range(min_count):
                        word_vocab.update(tmp)
                else:
                    word_vocab.update(word_tokenize(line.lower()))
                char_vocab.update(line)
            except Exception as error:
                print('Error with file: {}'.format(filepath))
                print(error)
    return word_vocab, char_vocab
def extract_wiki_fdict():
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_fdict_path, 'w')
    toktok = ToktokTokenizer()
    tid = 0
    pool = mp.Pool()
    for line in f:
        tid += 1
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        # combine header and features
        cols_features = pool.map(gov_data.get_col_features,
                                 list(header_content.values()))
        all_col_features = list(
            zip([each[0] for each in header_span], cols_features))
        for i in range(len(all_col_features)):
            if all_col_features[i][1]:
                all_col_features[i][1]['content'] = header_bows[i]
        all_col_features = list(filter(lambda x: x[1], all_col_features))
        f_dest.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
def loss_char(sentence, position):
    toktok = ToktokTokenizer()
    if sentence[position] in r" ,./;'[]\<>?:{}!@#$% ^&*()":
        return sentence
    if sentence[position] == " ":
        return sentence
    if sentence[position] in toktok.tokenize(sentence):
        return sentence
    return sentence[:position] + sentence[position + 1:]
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    # Keep only the words whose POS tag is not in the removal list
    filtered = untag([w for w in words_tagged if not w[1] in remove_tag_list])
    return ' '.join(map(str, filtered))
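# Hypothetical usage of RemoveWords_by_tag(); the exact tokens removed depend
# on nltk.pos_tag's model, so treat the output as indicative only.
print(RemoveWords_by_tag("the quick brown fox jumps over the lazy dog"))
# tokens tagged as adjectives (JJ/JJR/JJS) or comparative/superlative adverbs
# (RBR/RBS) are dropped, e.g. "quick" and "lazy"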
def extract_wiki_features(wiki_feature_path, wiki_bow_path):
    f_count = 0
    # for each wiki table, get header name, and corresponding content
    f = open(wiki_path, 'r')
    f_dest = open(wiki_feature_path, 'w')
    f_bow = open(wiki_bow_path, 'w')
    toktok = ToktokTokenizer()
    for line in f:
        t = json.loads(line)
        if not check_format(t):
            continue
        try:
            # header process
            header_iter = iter(t['tableHeaders'][-1])
            header_span = []
            header_content = dict()
            header_bows = dict()
            header_idx = 0
            for each_header in header_iter:
                html_desc = each_header['tdHtmlString']
                span = int(html_desc.split('colspan="')[1].split('"')[0])
                header_span.append((each_header['text'], span))
                header_content[header_idx] = []
                header_bows[header_idx] = []
                header_idx += 1
                if span != 1:
                    for skip_num in range(span - 1):
                        next(header_iter)
            # content process
            for row in t['tableData']:
                global_col_index = 0
                header_idx = 0
                for header, span in header_span:
                    for idx in range(span):
                        if row[global_col_index]['text'] != '':
                            header_content[header_idx].append(
                                row[global_col_index]['text'])
                            header_bows[header_idx].extend(
                                toktok.tokenize(row[global_col_index]['text']))
                        global_col_index += 1
                    header_idx += 1
        except:
            continue
        # combine header and features
        for col, f_dict, bows in zip([each[0] for each in header_span],
                                     map(get_col_features, header_content.values()),
                                     header_bows.values()):
            if f_dict:
                f_dict['_id'] = t['_id']
                f_dest.write(json.dumps({col: f_dict}) + '\n')
                f_bow.write(json.dumps({col: bows}) + '\n')
        print("finishing {0}".format(f_count))
        f_count += 1
def extract_gov_fdict(all_resources,
                      fdict_path=gov_data_fdict_path,
                      tid_type='cat_id',
                      restrict_resource=False):
    # extracting features:
    # table_id;label,curated_features,content;label,curated_features...
    f = open(fdict_path, 'w')
    #all_resources = gov_data.read_resources()
    all_resources = gov_data.wrong_csv(all_resources)
    all_resources = list(filter(lambda x: x.status, all_resources))
    if restrict_resource:
        all_resources = gov_data.select_resources(all_resources, fsize=50,
                                                  rs_ct=len(all_resources))
    pool = mp.Pool()
    total = len(all_resources)
    count = 0
    toktok = ToktokTokenizer()
    for resource in all_resources:
        print("processing {0}-th resource".format(count))
        for each_data in resource.data_files:
            try:
                if tid_type == 'cat_id':
                    tid = resource.rs_id + ':' + each_data.df_id
                elif tid_type == 'path':
                    tid = resource.path + '/' + each_data.df_id
                d_path = each_data.path + '/data.csv'
                df = pd.read_csv(d_path, delimiter=',', quotechar='"',
                                 dtype=str, na_filter=True)
                cols = df.columns
                contents = [df[each_col].dropna().tolist() for each_col in cols]
                print("extract content finished")
                cols_features = pool.map(gov_data.get_col_features, contents)
                all_col_features = list(zip(cols, cols_features))
                for i in range(len(all_col_features)):
                    if all_col_features[i][1]:
                        all_col_features[i][1]['content'] = toktok.tokenize(
                            ' '.join(contents[i]))
                all_col_features = list(
                    filter(lambda x: x[1], all_col_features))
                f.write(json.dumps({tid: all_col_features}, cls=MyEncoder) + '\n')
            except Exception as e:
                print(e)
        count += 1
        print("finish {0} out of {1}".format(count, total))
    f.close()
    return all_resources
def index(request):
    global invertedIndex
    global jsonData
    output_links = []
    searchTermsReq = request.GET.get('term', '')
    print(searchTermsReq)
    tokenizer = ToktokTokenizer()
    searchTerms = tokenizer.tokenize(searchTermsReq)
    print(searchTerms)
    response = {}
    output_data = defaultdict(int)
    output_links = []
    for token in searchTerms:
        token = token.lower()
        if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
            print('Looking through high for: ' + token)
            for docFilePath in invertedIndex[token]['high']:
                tfidf = invertedIndex[token]['high'][docFilePath]
                output_data[docFilePath] += tfidf
    if (len(output_data) < 10):
        for token in searchTerms:
            token = token.lower()
            if invertedIndex[token]['idf'] > 0.25 and len(token) > 1:
                print('Looking through low for: ' + token)
                for docFilePath in invertedIndex[token]['low']:
                    tfidf = invertedIndex[token]['low'][docFilePath]
                    output_data[docFilePath] += tfidf
    output_data = sorted(output_data.items(), key=itemgetter(1), reverse=True)
    for docFilePath, tfidf in output_data[:10]:
        output_links.append((jsonData[docFilePath], tfidf))
    output_links.sort(key=itemgetter(1), reverse=True)
    response['term'] = searchTermsReq
    response['results'] = output_links
    response['totalURLs'] = len(output_data)
    response['uniqueTokens'] = len(invertedIndex)
    response['totalDocuments'] = len(jsonData)
    return JsonResponse(response)
def __init__(self):
    self.morph = morph
    self.mystem = Mystem()
    self.tokenizer = ToktokTokenizer()
    self.w2v = Word2vecProcessor()
    self.synonyms = None
    self.antonyms = None
    self.phraseology = None
    self.phraseologisms = None
    self.prep_synon = None
    self.set_f = None
    self.verbs_dict = None
    self.chasti_rechi = None
    self.set_f_2 = None
def write_in_file():
    """ """
    # Get the files
    list_of_files = os.listdir(TRAINING_FOLDER_PATH)
    number_of_file = len(list_of_files)
    # Initialise the lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()
    # Initialise the tokenizer and override its ampersand (AMPERCENT) rule so
    # that '&' is not rewritten during tokenization
    tokenizer = ToktokTokenizer()
    tokenizer.AMPERCENT = re.compile('& '), '& '
    tokenizer.TOKTOK_REGEXES = [
        (regex, sub) if sub != '& ' else (re.compile('& '), '& ')
        for (regex, sub) in ToktokTokenizer.TOKTOK_REGEXES
    ]
    toktok = tokenizer.tokenize
    total_text = pd.Series([])
    # loop in the files
    for i in range(0, 11):
        file_name = list_of_files[i]
        print(i)
        # open the files
        with open(os.path.join(TRAINING_FOLDER_PATH, file_name), 'r', encoding="utf8") as text:
            string_text = text.read()
        splitted = toktok(string_text)
        # Lemmatize
        lemmatized = [lemmatizer.lemmatize(t) for t in splitted]
        tokens = pd.Series(lemmatized)
        # Take off random punctuation
        # All the numbers under the same name
        tokens.loc[tokens.apply(lambda x: x.isnumeric())] = "NUMBER"
        total_text = total_text.append(tokens)
        # Write in a file
        txtfilename = "training_text_file/" + str(i + 1) + "yo.txt"
        with io.open(txtfilename, "w", encoding="utf-8") as f:
            for item in total_text:
                f.write("%s " % item)
def error_generator(utterance):
    toktok = ToktokTokenizer()
    length = len(utterance)
    nb = nb_of_errors_in_utterance(length) + 1
    utterance = utterance + " "
    for i in range(nb):
        length = len(utterance) - 1
        position = np.random.choice(range(length), p=(length) * [1 / (length)])
        l = len(toktok.tokenize(utterance))
        utterance_old = utterance
        nb = np.random.randint(1, 5)
        utterance = functions[nb](utterance, position)
    return utterance
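# error_generator() dispatches through a `functions` mapping that is not shown
# in this snippet. A plausible sketch (all names except loss_char are
# hypothetical; np.random.randint(1, 5) produces keys 1-4):
#
#     functions = {
#         1: loss_char,      # drop the character at `position` (defined above)
#         2: swap_chars,     # hypothetical: transpose two adjacent characters
#         3: add_char,       # hypothetical: insert a random character
#         4: replace_char,   # hypothetical: substitute a random character
#     }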
def clean_archive_data(folder):
    toktok = ToktokTokenizer()
    if not os.path.exists(f"{folder}-cleaned"):
        os.makedirs(f"{folder}-cleaned")
    for count, file in enumerate(os.listdir(f"{folder}")):
        if count % 1000 == 0:
            print(count)
        file_data = open(f"{folder}/{file}", "r").read()
        try:
            text_newspaper = toktok.tokenize(fulltext(file_data))
            text_newspaper_cleaned = clean(" ".join(text_newspaper))
            with open(f"{folder}-cleaned/{file}", "w") as output:
                output.write(text_newspaper_cleaned)
        except:  # pylint: disable=W0702
            print(f"error with {file}", file=sys.stderr)
def preprocess(data):
    X, Y = [], []
    toktok = ToktokTokenizer()
    for index, review in data.iterrows():
        if (index + 1) % 100000 == 0:
            print(index + 1)
        # words = nltk.word_tokenize(review['text'])
        tokens = toktok.tokenize(review['text'].lower())
        X.append(tokens)
        # X.append(nltk.word_tokenize(review['text']))
        Y.append(int(review['stars'] - 1))
        # if len(Y) == 10000:
        #     break
    df_new = pd.DataFrame({'text': X, 'stars': Y})
    return df_new
def load_tokenizer(lang):
    if lang == "ko":
        from konlpy.tag import Mecab
        tokenizer = Mecab()
    elif lang == "ja":
        import Mykytea
        opt = "-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang == "zh_tw":
        import jieba
        tokenizer = jieba
    elif lang == "vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang == "th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang == "ar":
        import pyarabic.araby as araby
        tokenizer = araby
    # elif lang == "en":
    #     from nltk import word_tokenize
    #     tokenizer = word_tokenize
    else:
        from nltk.tokenize import ToktokTokenizer
        tokenizer = ToktokTokenizer()
    return tokenizer
def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:
        # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
    except:
        # Use a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize
    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]
    # Prepare the data for a 5-gram model
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    model = MLE(n)
    # Train an MLE model with the order n set above
    model.fit(train_data, padded_sents)
    #print(model.vocab)
    return model
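# Brief usage sketch for the model returned by makeModel(); generate() and
# score() belong to nltk.lm's MLE API, but the outputs depend entirely on the
# training corpus.
model = makeModel()
print(model.generate(10, random_seed=42))   # 10 sampled tokens
print(model.score('the'))                   # unigram relative frequency
print(model.score('is', ['this']))          # P('is' | context 'this')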
def tokenize_nltk(self, text):
    toktok = ToktokTokenizer()
    # tokens = [toktok.tokenize(sent) for sent in sent_tokenize(text)]
    tokens = nltk.word_tokenize(text)
    stems = [stemmer.stem(t) for t in tokens]
    # print("Number of tokens:", len(tokens))
    return stems
def prepare_data(args):
    UNK_TOKEN = "<unk>"
    PAD_TOKEN = "<pad>"
    SOS_TOKEN = "<s>"
    EOS_TOKEN = "</s>"
    if args.tokenize:
        toktok = ToktokTokenizer()
        SRC = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
                         init_token=None, eos_token=EOS_TOKEN,
                         lower=args.lower, tokenize=toktok.tokenize)
        TGT = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
                         init_token=SOS_TOKEN, eos_token=EOS_TOKEN,
                         lower=args.lower, tokenize=toktok.tokenize)
    else:
        SRC = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
                         init_token=None, eos_token=EOS_TOKEN,
                         lower=args.lower)
        TGT = data.Field(unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
                         init_token=SOS_TOKEN, eos_token=EOS_TOKEN,
                         lower=args.lower)
    MAX_LEN = args.max_lenght
    if args.iwslt:
        datatype = IWSLT14
    else:
        datatype = datasets.Multi30k
    train_data, val_data, test_data = datatype.splits(
        exts=('.de', '.en'),
        fields=(SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN
        and len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = args.min_freq
    SRC.build_vocab(train_data.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train_data.trg, min_freq=MIN_FREQ)
    return train_data, val_data, test_data, SRC, TGT
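# Sketch of how the splits returned by prepare_data() are typically batched,
# using the same legacy torchtext.data API the function already relies on
# (batch sizes and sort key are placeholder choices):
train_data, val_data, test_data, SRC, TGT = prepare_data(args)
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_sizes=(64, 64, 64),
    sort_key=lambda x: len(x.src),
)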
def build_vocabs(filepath, min_count):
    """Build the word and char counter vocabularies"""
    toktok = ToktokTokenizer()
    word_vocab = Counter()
    char_vocab = Counter()
    with open(filepath, 'r', encoding='utf8') as f:
        try:
            line = f.read()
            if 'numbers_' in filepath:
                tmp = toktok.tokenize(line.lower())
                for i in range(min_count):
                    word_vocab.update(tmp)
            else:
                word_vocab.update(word_tokenize(line.lower()))
            char_vocab.update(line)
        except Exception as error:
            print('Error with file: {}'.format(filepath))
            print(error)
    return word_vocab, char_vocab
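# Hypothetical usage of build_vocabs(); the file name is a placeholder.
word_vocab, char_vocab = build_vocabs('corpus.txt', min_count=5)
frequent = {w for w, c in word_vocab.items() if c >= 5}  # drop rare words
print(len(frequent), word_vocab.most_common(10))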
def tokenize_nltk(self, text):
    self.tokenizer_counter += 1
    logger.debug("item:" + str(self.tokenizer_counter) + "/" + str(self.tokenizer_len))
    toktok = ToktokTokenizer()
    # tokens = [toktok.tokenize(sent) for sent in sent_tokenize(text)]
    tokens = nltk.word_tokenize(text)
    # logger.debug("Number of tokens:" + str(len(tokens)))
    stems = [self.stemmer.stem(t) for t in tokens]
    return stems
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()
    # Frequencies for each file
    fdists = []
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [
            toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)
        ]
        fdist = Counter(chain(*tokenised))
        fdists.append(fdist)
    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in fdists:
        keys += sublist
    keys = set(keys)
    # Build combined frequency dict
    # Tuple of Brown-corpus tag identifiers for connectives and other common words
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be',
                'np$', 'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'ben', 'wps',
                'vbd', 'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex',
                'i', 'dti', 'dod', 'wrb', 'hvz', 'nn$')
    # This is far from the best way to do this, but I couldn't find the
    # documentation for these identifiers
    frequencies = {}
    for key in keys:
        total = 0
        # Gets rid of unwanted tokens
        if (key[0] not in string.punctuation) and (key.split('/')[-1] not in unwanted):
            for sublist in fdists:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))
    return frequencies
def __init__(self, doc_map):
    self.posting_list = {}
    self.mine = ['br', '\'', 'http', 'url', 'web', 'www', 'blp', 'ref', 'external', 'links']
    self.stop_words = set(stopwords.words('english')).union(self.mine)
    # self.ps = PorterStemmer().stem
    self.ps = SnowballStemmer("english").stem
    self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[0-9]{,4}')
    self.d = doc_map
    self.sent = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    self.toktok = ToktokTokenizer()
def __init__(self, doc_map):
    self.posting_list = {}
    self.mine = ['br', '\'', 'http', 'url', 'web', 'www', 'blp', 'ref', 'external', 'links']
    self.stop_words = set(stopwords.words('english')).union(self.mine)
    self.ps = PorterStemmer().stem
    self.tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    self.d = doc_map
    self.t = 0
    self.toktok = ToktokTokenizer()
def __init__(self, stopwords: Set[str]) -> None:
    self.stopwords = stopwords
    self.ps = WordNetLemmatizer()
    self.stemmer = SnowballStemmer("english")
    self.tokenizer = ToktokTokenizer()
    self.puncuation = set(string.punctuation)
    # self.words = set(nltk.corpus.words.words())
    self.pipeline = [
        self.remove_punctuation,
        self.tokenize,
        self.lowering,
        self.remove_words,
        self.remove_stopwords,
        self.remove_digits_and_punctuation,
        self.remove_dangling_puncuation,
        self.remove_single,
        self.stemm,
        self.remove_starting_with_file,
    ]
    self.words_to_remove = set(
        "edit wookieepedia format registerr wrapup wiki sandbox click edit page link code preview button format"
        .split(" "))