def text_cleaner(text):
    negations_dictionary = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
        "mightn't": "might not", "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dictionary.keys()) + r')\b')
    tokenizer = WordPunctTokenizer()
    processed_text = text.lower()
    negation_handled = negations_pattern.sub(
        lambda x: negations_dictionary[x.group()], processed_text)
    processed_text = re.sub("[^A-Za-z]", ' ', negation_handled)
    words = [x for x in tokenizer.tokenize(processed_text) if len(x) > 1]
    return words
class NewsgroupsReader(object):

    def __init__(self, tokenize):
        self._tokenize = tokenize
        self._tokenizer = WordPunctTokenizer()

    def get_training(self):
        return self._get_docs('datasets/20news-bydate-train')

    def get_test(self):
        return self._get_docs('datasets/20news-bydate-test')

    def _get_docs(self, path):
        doc_objects = []
        i = 0
        for category in listdir(path):
            for f in listdir(path + "/" + category):
                with codecs.open(path + "/" + category + "/" + f, 'r', encoding='latin1') as content_file:
                    text = content_file.read()
                    tokens = self._tokenizer.tokenize(text) if self._tokenize else text
                    doc_objects.append(Document(i, tokens, category))
                    i += 1
        random.shuffle(doc_objects)
        return doc_objects
def filter_stop_words(text, stop_words):
    wpt = WordPunctTokenizer()
    tokenized_words = wpt.tokenize(text)
    processed_words = [word for word in tokenized_words if word not in stop_words]
    text = ' '.join([str(word) for word in processed_words])
    return text
class PunctTokenizer(object):

    def __init__(self,
                 lower=True,
                 prepend_cls=False,
                 prepend_bos=False,
                 append_eos=False,
                 stopwords=None,
                 specials=SPECIAL_TOKENS):
        self.lower = lower
        # Note: the specials argument is not stored; the class always uses SPECIAL_TOKENS.
        self.specials = SPECIAL_TOKENS
        self.pre_id = []
        self.post_id = []
        self.stopwords = stopwords
        if prepend_cls and prepend_bos:
            raise ValueError("prepend_bos and prepend_cls are mutually exclusive")
        if prepend_cls:
            self.pre_id.append(self.specials.CLS.value)
        if prepend_bos:
            self.pre_id.append(self.specials.BOS.value)
        if append_eos:
            self.post_id.append(self.specials.EOS.value)
        self.punct = WordPunctTokenizer()

    def __call__(self, x):
        if self.lower:
            x = x.lower()
        x = self.pre_id + self.punct.tokenize(x) + self.post_id
        if self.stopwords:
            x = [w for w in x if w not in self.stopwords]
        return x
def clean_tweet(tweet):
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', tweet)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet
def sentence2words(sentence):
    result = []
    word_punct_tokenizer = WordPunctTokenizer()
    words = word_punct_tokenizer.tokenize(sentence)
    stemmer = nltk.stem.SnowballStemmer('english')
    for word in words:
        stemmed_word = stemmer.stem(word)
        result.append(stemmed_word)
    return result
def _tokenize(self, doc):
    all_tokens = []
    sentences = sent_tokenize(doc)
    tokenizer = WordPunctTokenizer()
    for sentence in sentences:
        words = tokenizer.tokenize(sentence.lower())
        words = [word for word in words if word not in punctuation]
        all_tokens.extend(words)
    return all_tokens
def load_task2(articles_path, labels_path, tokenizer='punct'):
    file_names, labels, spans = get_class_labels(labels_path)
    corpus = load_data(articles_path)
    tknz = WordPunctTokenizer()
    samples = []
    for span, file_name in zip(spans, file_names):
        article = corpus[file_name]
        tokenized_span = tknz.tokenize(article[span[0]:span[1]])
        samples.append(tokenized_span)
    return samples, labels, spans, file_names
def _sentence_tok(delex_texts: List[str]) -> List[List[List[str]]]:
    # Tokenize the texts into sentences, then into words.
    sentence_tok_texts = []
    tknzr = WordPunctTokenizer()
    for text in delex_texts:
        sentences = sent_tokenize(text)
        tok_sentences = []
        for sentence in sentences:
            tok_sentences.append(tknzr.tokenize(sentence))
        sentence_tok_texts.append(tok_sentences)
    return sentence_tok_texts
class CustomTokenizer:

    def __init__(self, unicode_to_ascii=True, punct_one_token_per_char=True):
        self.unicode_to_ascii = unicode_to_ascii
        self.punct_one_token_per_char = punct_one_token_per_char
        # Note: the \p{P} (Unicode punctuation) class requires the third-party
        # `regex` module; the standard-library `re` module does not support it.
        self._re_punct = re.compile(r"(\p{P})")
        self._tokenizer = WordPunctTokenizer()

    def tokenize(self, text):
        if self.unicode_to_ascii:
            text = unidecode(text)
        if self.punct_one_token_per_char:
            text = re.sub(self._re_punct, r"\1 ", text)
        return self._tokenizer.tokenize(text)
def stemming_words(text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])
    return text
def stemming_words(self, text):
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    turkishStemmer = TurkishStemmer()
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])
    return text
class TolstojParser(HTMLParser):
    # Note: this snippet uses Python 2 idioms (unicode, itertools.izip,
    # dict.has_key, print statements).

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.bgrams = {}
        self.sorted_bgrams = []
        self.tokenizer = WordPunctTokenizer()
        self.token_count = 0

    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            tokens = self.tokenizer.tokenize(unicode(data, 'utf-8').lower())
            for t1, t2 in itertools.izip(tokens, tokens[1:]):
                self.token_count += 1
                if (t1[0] in string.punctuation) or (t2[0] in string.punctuation):
                    continue
                key = t1.encode('utf-8') + ' ' + t2.encode('utf-8')
                if self.bgrams.has_key(key):
                    self.bgrams[key] += 1
                else:
                    self.bgrams[key] = 1

    def dump_bgrams(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.bgrams, output)
        output.close()

    def make_sorted_bgrams(self):
        self.sorted_bgrams = sorted(self.bgrams.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_bgrams(self):
        for key, count in self.sorted_bgrams:
            print key, count
def get_average_embedding(embedding, review):
    """
    Builds a list of word vectors for all words in the review, then averages
    them to return a single vector for the review.

    :param embedding: embedding object - either FastText or Word2Vec
    :param review: review text
    :return: averaged word vector, or None if too few words were found
    """
    log.debug(f'Getting average embedding for: [{review}]')
    wpt = WordPunctTokenizer()
    # Since we are using a pre-trained embedding, we may not find every word.
    word_vectors = [embedding.wv.get_vector(word)
                    for word in wpt.tokenize(review)
                    if word in embedding.wv.vocab]
    log.debug(f'word_vector shape [{np.shape(word_vectors)}]')
    # Average all word vectors to come up with the final vector for the review.
    if np.shape(word_vectors)[0] > 1:
        return np.average(word_vectors, axis=0)
    return None
def generate_fasttext_file(x: pd.DataFrame,
                           y: pd.Series,
                           description: str,
                           feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    Generate features using the fastText embedding.
    https://radimrehurek.com/gensim/models/fasttext.html

    :param x:
    :param y:
    :param description:
    :param feature_size: dimensionality of the word vectors
    :param window_context: maximum distance between the current and predicted word within a sentence
    :param min_word_count: the model ignores all words with total frequency lower than this
    :param sample: threshold for configuring which higher-frequency words are randomly
        downsampled; useful range is (0, 1e-5)
    :param iterations: number of iterations (epochs) over the corpus
    :return:
    """
    log.info("generating fasttext")
    log.debug(f'{x.head()}')

    wpt = WordPunctTokenizer()
    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)
    # TODO: add in configuration for pre-trained
    ft_model = FastText(documents,
                        size=int(feature_size),
                        window=int(window_context),
                        min_count=int(min_word_count),
                        sample=sample,
                        iter=int(iterations))
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)

    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    ft_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    if timer:
        timer.start_timer(FEATURE_TIME_MIN)
    feature_df = get_feature_df(ft_model, x)
    if timer:
        timer.end_timer(FEATURE_TIME_MIN)

    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
class KareninaParser(HTMLParser):
    # Note: this snippet uses Python 2 idioms (unicode, dict.has_key,
    # print statements).

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()

    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
            self.inside_dd = False

    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(unicode(data.lower(), 'utf-8')):
                if token[0] in string.punctuation:
                    continue
                self.token_count += 1
                self.token_sum_len += len(token)
                term = self.stemmer.stem(token)
                if term not in terms:
                    terms.add(term)
                    if self.iindex.has_key(term):
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [self.doc_id]

    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()

    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()

    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)
        term_count = len(self.iindex.keys())
        if not (term_count and self.token_count):
            self.stat = {}
        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len / float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len / float(term_count)
            }
        return self.stat

    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print term
            print len(posting_list)
            print posting_list
            print '---------------------'
class Decompounder(object):
    """Word decompounder."""

    def __init__(self):
        """Set up map."""
        self.word_tokenizer = WordPunctTokenizer()
        filename = join(split(__file__)[0], 'data', 'compounds.txt')
        self.decompound_map = {}
        with open(filename, encoding='utf-8') as fid:
            for line in fid:
                parts = line.strip().split('|')
                compound = "".join(parts)
                decompounded_parts = [
                    part for part in parts if part != 's' and part != 'e'
                ]
                decompounded = " ".join(decompounded_parts)
                self.decompound_map[compound] = decompounded

    def decompound_text(self, text):
        """Return decompounded text.

        Parameters
        ----------
        text : str
            Text as a (unicode) str.

        Returns
        -------
        decompounded : str
            String with decompounded parts separated by a whitespace.

        Examples
        --------
        >>> decompounder = Decompounder()
        >>> text = 'Det er en investeringsvirksomhed'
        >>> decomp = decompounder.decompound_text(text)
        >>> decomp == 'det er en investering virksomhed'
        True

        """
        tokens = self.word_tokenizer.tokenize(text)
        return " ".join(
            self.decompound_word(token.lower()) for token in tokens)

    def decompound_word(self, word):
        """Return decompounded word.

        Parameters
        ----------
        word : str
            Word as a (unicode) str.

        Returns
        -------
        decompounded : str
            String with decompounded parts separated by a whitespace.

        Examples
        --------
        >>> decompounder = Decompounder()
        >>> decomp = decompounder.decompound_word('investeringsvirksomhed')
        >>> decomp == 'investering virksomhed'
        True

        """
        return self.decompound_map.get(word, word)
def sentence2words(self, sentence):
    word_punct_tokenizer = WordPunctTokenizer()
    words = word_punct_tokenizer.tokenize(sentence)
    return words
ngrams = []
n, m = 0, 0
t = int(time())
l = len(messages)
for message in messages:
    if message == "<|BEGIN|>":
        ngram = []
    elif message == "<|END|>":
        phrases = []
        for phrase in ngram:
            terms = set(te(phrase, strings=1, nested=1))
            words = list(
                set([ma.parse(w)[0].normal_form for w in wpt.tokenize(phrase)]))
            idx = []
            for word in words:
                w = 1 if word in terms else .5
                idx += [(w, word)]
            phrases += [(idx, phrase)]
        ngrams += [phrases]
    else:
        ngram += [message]
    n += 1
    if time() - t > 1:
        print("%s of %s, %s / sec" % (m, l, n))
        m += n
        n = 0
        t = int(time())
class FeatureExtractor(BaseEstimator):
    """Feature extractor for Danish texts."""

    def __init__(self):
        """Set up text processors."""
        self.afinn = Afinn(language='da')
        self.word_tokenizer = WordPunctTokenizer()

    def partial_fit(self, Y, y=None):
        """Fit model.

        This is a dummy function.

        """
        return self

    def fit(self, X, y=None):
        """Fit model.

        This is a dummy function.

        """
        return self

    @property
    def features_(self):
        """Set up features."""
        features = [
            'n_characters', 'n_words', 'n_unique_words',
            'afinn_sum_valence', 'afinn_sum_arousal', 'afinn_sum_ambiguity'
        ]
        return features

    def transform(self, raw_documents, y=None):
        """Transform documents to features.

        Parameters
        ----------
        raw_documents : iterable over str
            Iterable with corpus to be transformed.
        y : numpy.array
            Target (not used, dummy parameter).

        """
        X = []
        for n, document in enumerate(raw_documents):
            words = self.word_tokenizer.tokenize(document)
            unique_words = set(words)
            scores = self.afinn.scores(document)
            sum_valence = sum(scores)
            sum_arousal = np.sum(np.abs(scores))
            X.append([
                len(document),
                len(words),
                len(unique_words),
                sum_valence,
                sum_arousal,
                sum_arousal - abs(sum_valence)
            ])
        X = np.array(X)
        return X

    fit_transform = transform
def generate_word2vec_file(x: pd.DataFrame,
                           y: pd.Series,
                           description: str,
                           feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    Generate features using word2vec.

    :param x:
    :param y:
    :param description:
    :param feature_size:
    :param window_context:
    :param min_word_count:
    :param sample:
    :param iterations:
    :return:
    """
    log.info("generating word2vec")
    log.debug(f'{x.head()}')

    wpt = WordPunctTokenizer()
    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)
    # TODO: add configuration for pre-trained or train
    w2v_model = Word2Vec(documents,
                         size=int(feature_size),
                         window=int(window_context),
                         min_count=int(min_word_count),
                         sample=sample,
                         iter=int(iterations))
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)

    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    w2v_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    feature_df = get_feature_df(w2v_model, x)
    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
def evaluate(self):
    path = self.dataset.getPath()
    try:
        features = pickle.load(open(f"{path}/preprocessed.p", "rb"))
    except:
        features = self.processor.process()
        pickle.dump(features, open(f"{path}/preprocessed.p", "wb"))

    word_punctuation_tokenizer = WordPunctTokenizer()
    word_tokenized_corpus = [
        word_punctuation_tokenizer.tokenize(sent) for sent in features]

    embedding_size = 64
    window_size = 3
    min_word = 5
    down_sampling = 1e-2

    ft_model = FastText(word_tokenized_corpus,
                        size=embedding_size,
                        window=window_size,
                        min_count=min_word,
                        sample=down_sampling,
                        sg=1,
                        iter=100)

    embedding_matrix = np.zeros((len(ft_model.wv.vocab) + 1, 64))
    for i, vec in enumerate(ft_model.wv.vectors):
        embedding_matrix[i] = vec
    vocab_size = len(ft_model.wv.vocab) + 1

    labels = self.dataset.getClasses()
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)
    labels = to_categorical(labels)

    return self.ft_model(features, labels, embedding_matrix, vocab_size, ft_model)
def tokenize(text_array, use_pos=False, data_type=None, lang=None):
    """
    Given an array of sentences, returns:
        If use_pos: an array of tokenised sentences, where each tokenised
            sentence is an array of (token, POS tag) tuples.
        Else: an array of tokenised sentences, where each tokenised sentence
            is an array of tokens.

    NOTE: If use_pos is False, the rest of the kwargs are ignored.
    """
    if use_pos:
        # Since POS tags take long to generate, use the cached version if it exists
        cache_path = None
        if data_type == DatasetType.TRAIN:
            cache_path = os.path.join(SAVED_POS_BASE_PATH, f'train-{lang}-pos.pickle')
        elif data_type == DatasetType.VAL:
            cache_path = os.path.join(SAVED_POS_BASE_PATH, f'val-{lang}-pos.pickle')
        elif data_type == DatasetType.TEST:
            cache_path = os.path.join(SAVED_POS_BASE_PATH, f'test-{lang}-pos.pickle')

        if os.path.isfile(cache_path):
            with open(cache_path, 'rb') as handle:
                sentences = pickle.load(handle)
            return sentences

    tokeniser = WordPunctTokenizer()
    sentences = []
    with tqdm(total=len(text_array)) as pbar:
        for sentence in text_array:
            tokens = tokeniser.tokenize(sentence)
            lower_cased_tokens = []
            for tok in tokens:
                tok_lower = tok.lower()
                lower_cased_tokens.append(tok_lower)
            if use_pos:
                # Store tokenised sentence i.e. arrays of (token, POS_TAG) tuples
                try:
                    sentences.append(get_pos_tags(lower_cased_tokens, lang))
                except:
                    sentences.append([
                        get_pos_tags([tok], lang)[0] for tok in lower_cased_tokens
                    ])
            else:
                # Store tokenised sentence
                sentences.append(lower_cased_tokens)
            pbar.update(1)

    if use_pos:
        # Store POS tags to allow faster loading on the next invocation
        with open(cache_path, 'wb') as handle:
            pickle.dump(sentences, handle)

    return sentences
class SentenceReader:

    def __init__(self, thesaurus, need_deeppavlov=True, deeppavlov_model=None,
                 need_syntax=True, syntax_model=None):
        self.need_deeppavlov = need_deeppavlov
        if need_deeppavlov:
            self.deeppavlov_lemma = deeppavlov_model if deeppavlov_model else build_model(
                configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert, download=False)
        if need_syntax:
            self.syntax_model = syntax_model if syntax_model else build_model(
                configs.syntax.syntax_ru_syntagrus_bert, download=False)
        else:
            self.syntax_model = None
        self.tokenizer = WordPunctTokenizer()
        self.thesaurus = thesaurus

    def process_file(self, filename, verbose=False):
        tagged_lemmas = []
        initial_sentences = []

        # Stats for output
        broken_sentences = 0
        failed_lemmatize = 0

        with open(filename) as tagged_file:
            current_sentence_tokens = []
            current_sentence_lemmas = []
            need_append = False
            for line in tagged_file.readlines():
                if line.startswith("# sent_id"):
                    need_append = True
                elif line.startswith("# text"):
                    continue
                elif len(line) < 2:
                    sentences_lemma_divided = self.divide_tagged(current_sentence_lemmas)
                    sentence_initial_divided = self.divide_tagged(current_sentence_tokens)
                    tagged_lemmas += sentences_lemma_divided
                    initial_sentences += sentence_initial_divided
                    broken_sentences += (len(sentences_lemma_divided) - 1)
                    need_append = False
                    current_sentence_tokens = []
                    current_sentence_lemmas = []
                else:
                    if need_append:
                        line_splitted = line.split('\t')
                        current_sentence_tokens.append(line_splitted[1].lower())
                        current_sentence_lemmas.append(line_splitted[2].lower())

        parsed_sentences = []
        for init_tokens, lemma_tokens in zip(initial_sentences, tagged_lemmas):
            deeppavlov_lemma = None
            deeppavlov_pos = None
            if self.need_deeppavlov:
                try:
                    deeppavlov_lemma, deeppavlov_pos = self.get_deeppavlov_info(init_tokens)
                except:
                    failed_lemmatize += 1
                    deeppavlov_lemma = None
                    deeppavlov_pos = None
            parsed_sentences.append(
                ParsedSentence(init_tokens, lemma_tokens, self.thesaurus,
                               deeppavlov_lemma, deeppavlov_pos, self.syntax_model))

        if verbose:
            print("Processed {}. Recovered {} sentences, lost {} too long".format(
                filename, broken_sentences, failed_lemmatize))
        return parsed_sentences

    def process_directory(self, dir_path, verbose=False):
        text_names = listdir(dir_path)
        all_sentences = []
        for filename in text_names:
            full_path = join(dir_path, filename)
            parsed_sentences = self.process_file(full_path)
            all_sentences += parsed_sentences
        return all_sentences

    def divide_tagged(self, tagged_sentence):
        single_sentence = " ".join(tagged_sentence)
        sentence_parts = single_sentence.split(".")
        return [
            self.tokenizer.tokenize(part) + ["."]
            for part in sentence_parts if len(part) > 0
        ]

    def get_deeppavlov_info(self, tagged_sentence):
        sentences = [tagged_sentence]
        morpho_tokens = self.deeppavlov_lemma(sentences)[0].split('\n')
        splitted_info = [x.split('\t') for x in morpho_tokens]
        lemmatized_tokens = [
            splitted[2] for splitted in splitted_info if len(splitted) == 10
        ]
        pos = [
            splitted[3] for splitted in splitted_info if len(splitted) == 10
        ]
        return lemmatized_tokens, pos
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000
sentTokenier = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenier.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
dict = Counter(vocab)

sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]
new_sentences = []

with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in dict.keys():
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
from gensim import corpora, models, similarities
from nltk import WordPunctTokenizer
import re

NUM_TOPICS = 40

stopwords = open('stopwords.txt').read().split('\n')
word_re = re.compile(r'[a-z0-9\s]+')
tokenizer = WordPunctTokenizer()
tokenize = lambda text: [w.lower() for w in tokenizer.tokenize(text)
                         if re.match(word_re, w.lower()) and w.lower() not in stopwords]

id2word = corpora.Dictionary.load('dictionary.dict')
mm = corpora.MmCorpus('tfidf.mm')
lsi = models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=NUM_TOPICS)
dic = corpora.Dictionary.load('dictionary.dict')


def get_topics(text, num, model=lsi):
    """
    get +num+ topics for text +text+
    """
    topics = []
    for t in sorted(model[dic.doc2bow(tokenize(text))], key=lambda t: t[1], reverse=True)[:num]:
        topics.append([u[1] for u in lsi.show_topic(t[0])])
    return topics
def finder(query):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    stop_words = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]

    dataset = pd.read_csv("abs.csv")
    data = dataset[['Medline_No', 'Abstract']]

    from nltk import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    descriptions = [
        tokenizer.tokenize(description.lower())
        for description in data["Abstract"] if description not in stop_words
    ]

    from gensim import corpora
    corpora_dict = corpora.Dictionary(descriptions)

    # Basic model which needs improvement
    corpus = [corpora_dict.doc2bow(text) for text in descriptions]

    from gensim import similarities
    index_bow = similarities.SparseMatrixSimilarity(
        corpus, num_features=len(corpora_dict))

    from gensim.models import TfidfModel
    model_tfidf = TfidfModel(corpus)
    vector = model_tfidf[corpus[0]]
    corpus_tfidf = model_tfidf[corpus]
    index_tfidf = similarities.SparseMatrixSimilarity(
        corpus_tfidf, num_features=len(corpora_dict))

    def search(index, query, top_n=10, prints=False):
        """
        This function searches for the texts most similar to the query.

        :param index: gensim.similarities object
        :param query: a string
        :param top_n: how many variants it returns
        :param prints: if True, prints/collects the results; otherwise returns them
        :returns: a list of tuples (matched_document_index, similarity_value)
        """
        # get a BoW vector for the query
        bow_vec = corpora_dict.doc2bow(query.lower().split())
        # similarities between the query and all indexed documents
        similarities = index[bow_vec]
        similarities = [(x, i) for i, x in enumerate(similarities)]
        # sort by similarity_value in decreasing order
        similarities.sort(key=lambda elem: -elem[0])
        res = []
        if prints:
            print(f"{query}\n")
        for result in similarities[:top_n]:
            if prints:
                if result[0] > 0:
                    dic = {
                        'Medline_No': data['Medline_No'][result[1]],
                        'Title': data_dict_title[data['Medline_No'][result[1]]]
                    }
                    myresult.append(dic)
            else:
                res.append((result[1], result[0]))
        if not prints:
            return res

    myresult = []
    search(index_tfidf, query, prints=True)
    return myresult
class DataPreparator:
    # Note: this snippet uses Python 2 print statements.

    def __init__(self, input_size, batch_size, path_to_write):
        self.word_vectors = FastText.load("D:\\Typing\\araneum_none_fasttextskipgram_300_5_2018.model")
        self.input_size = input_size
        self.tokenizer = WordPunctTokenizer()
        self.batch_size = batch_size
        self.path = path_to_write
        self.punctuations = ['.', ',', '-', '\'', '\"', '!', '?', '(', ')', ':', ';']

    def define_word_vector(self):
        num = 0
        dir = "D:\\Typing\\texts\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        inputs = []
        outputs = []
        for file in files:
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for i in range(len(words) - 5):
                        try:
                            input = (self.word_vectors[words[i]],
                                     self.word_vectors[words[i + 1]],
                                     self.word_vectors[words[i + 2]])
                            output = (self.word_vectors[words[i + 3]])
                        except KeyError:
                            continue
                        inputs.append(input)
                        outputs.append(output)
                        if len(outputs) == self.batch_size:
                            with open(self.path + str(num), 'w') as f:
                                for k in range(len(outputs)):
                                    f.write(self.vectors_to_string(inputs[k]) + ':' +
                                            self.vectors_to_string(outputs[k]) + '\n')
                            print str(num)
                            num += 1
                            inputs = []
                            outputs = []

    def define_freq_word(self, n=1000):
        num = 0
        self.freq_words = self.load_freq_words(n)
        dir = "D:\\Typing\\texts\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        inputs = []
        outputs = []
        for file in files:
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for i in range(len(words) - 5):
                        if words[i + 3] in self.freq_words.keys():
                            try:
                                input = (self.word_vectors[words[i]],
                                         self.word_vectors[words[i + 1]],
                                         self.word_vectors[words[i + 2]])
                            except KeyError:
                                continue
                            output = np.zeros(n)
                            output[self.freq_words[words[i + 3]]] = 1
                            inputs.append(input)
                            outputs.append(output)
                            if len(outputs) == self.batch_size:
                                with open(self.path + str(num), 'w') as f:
                                    for k in range(len(outputs)):
                                        f.write(self.vectors_to_string(inputs[k]) + ':' +
                                                self.vectors_to_string(outputs[k]) + '\n')
                                print str(num)
                                num += 1
                                inputs = []
                                outputs = []
                                if num == 85000:
                                    return

    def load_freq_words(self, n):
        words = {}
        counter = 0
        with io.open('D:\\Typing\\freq_words.txt', 'r', encoding='utf-8') as f:
            w = f.read().split('\n')
            for word in w:
                if counter < n:
                    words[word] = counter
                    counter += 1
                else:
                    return words

    def count_freq_words(self):
        dir = "D:\\Typing\\texts_1\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        counter = Counter()
        n = 0
        for file in files:
            print str(n)
            n += 1
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for word in words:
                        counter[word.lower()] += 1
        with io.open('D:\\Typing\\freq_words.txt', 'w', encoding='utf-8') as f:
            for w in counter.most_common(len(counter)):
                f.write(w[0] + u'\n')

    def vectors_to_string(self, vectors):
        s = ''
        if isinstance(vectors, tuple):
            for vector in vectors:
                for element in vector:
                    s += str(element) + ','
        else:
            for element in vectors:
                s += str(element) + ','
        return s[:len(s) - 1]
from dasem.fullmonty import Word2Vec
from dasem.text import Decompounder

from cvrminer.cvrmongo import CvrMongo
from cvrminer.text import PurposeProcessor
from cvrminer.virksomhed import Virksomhed

# Ignore broken pipe errors
signal.signal(signal.SIGPIPE, signal.SIG_DFL)

decompounder = Decompounder()
purpose_processor = PurposeProcessor()
w2v = Word2Vec()
word_tokenizer = WordPunctTokenizer()

n = 1
cvr_mongo = CvrMongo()
for company in cvr_mongo.iter_companies():
    virksomhed = Virksomhed(company)
    purposes = virksomhed.formaal
    for purpose in purposes:
        cleaned_purpose = purpose_processor.clean(purpose)
        words = word_tokenizer.tokenize(cleaned_purpose)
        for word in words:
            word = word.lower()
            if word not in w2v.model:
                phrase = decompounder.decompound_word(word)
                for subphrase in phrase.split(' '):
                    if subphrase not in w2v.model:
                        write(1, subphrase.encode('utf-8') + b('\n'))