def get_words_by_content(content):
    words = []
    tokenizer = SpaceTokenizer()
    words += tokenizer.tokenize(content)
    ##words = list(set(words))
    # words = frozenset(words)
    return words
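A brief usage note: SpaceTokenizer splits on single space characters only, so newlines and tabs stay attached to tokens:

from nltk.tokenize.simple import SpaceTokenizer

tokenizer = SpaceTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88\nin New York."))
# ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.']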
def generate(sents, c, dictionary):
    '''
    args   : np.ndarray of strings
    return : context-target pairs as two stacked arrays
    '''
    tokenizer = SpaceTokenizer()
    xs = []
    ys = []
    for sent in sents:
        sent = tokenizer.tokenize(sent)
        start = c
        end = len(sent)
        for i in range(start, end - c):
            context = []
            for j in range(-c, c + 1):
                if j == 0:
                    pass
                else:
                    context.append(dictionary.word2idx[sent[i + j]])
            xs.append(context)
            ys.append(dictionary.word2idx[sent[i]])
    x = np.vstack(xs)
    y = np.vstack(ys)
    return x, y
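A worked sketch of what generate returns for a window of c=1. ToyDict is a hypothetical stand-in for the Dictionary used here; only its word2idx mapping is needed:

import numpy as np

class ToyDict:
    # Hypothetical stand-in: generate() only reads .word2idx
    def __init__(self, words):
        self.word2idx = {w: i for i, w in enumerate(words)}

d = ToyDict(["the", "cat", "sat", "down"])
x, y = generate(np.array(["the cat sat down"]), c=1, dictionary=d)
print(x)  # [[0 2]      contexts: ("the", "sat"), ("cat", "down")
          #  [1 3]]
print(y)  # [[1]        targets:  "cat", "sat"
          #  [2]]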
def __init__(self, db):
    GraphBuilder.__init__(self, db)
    if self._domain == u'Microblog':
        self._tokenizer = TweetTokenizer()
    else:
        self._tokenizer = SpaceTokenizer()
def __init__(self, db):
    GraphBuilder.__init__(self, db)
    self._author_guid_posts_dict = {}
    self._author_guid_bag_of_words_dict = {}
    self._word_dict = {}
    if self._domain == u'Microblog':
        self._tokenizer = TweetTokenizer()
    else:
        self._tokenizer = SpaceTokenizer()
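Both constructors pick the tokenizer by domain. A small comparison (plain nltk; output shown for recent NLTK versions) illustrates why microblog text gets TweetTokenizer, which keeps mentions, hashtags, and emoticons intact:

from nltk.tokenize import TweetTokenizer
from nltk.tokenize.simple import SpaceTokenizer

text = "@user loving this!! #nlp :)"
print(TweetTokenizer().tokenize(text))
# ['@user', 'loving', 'this', '!', '!', '#nlp', ':)']
print(SpaceTokenizer().tokenize(text))
# ['@user', 'loving', 'this!!', '#nlp', ':)']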
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
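A usage sketch, assuming the tokenizer classes imported by this module are available: the empty-string name falls through to whitespace tokenization, while model-prefixed names load the matching pretrained vocabulary.

tok = get_tokenizer("")             # SpaceTokenizer fallback
print(tok.tokenize("hello world"))  # ['hello', 'world']

# Name-prefix dispatch; fetches the pretrained vocab on first use
# (assumption: BertTokenizer here is the class imported by this module).
bert_tok = get_tokenizer("bert-base-uncased")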
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        from pytorch_pretrained_bert import BertTokenizer
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name == "OpenAI.BPE":
        tokenizer = OpenAIBPETokenizer()
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
from io import StringIO
from typing import Iterable, List, NewType, Sequence, Text, Tuple, Type, Union

import numpy as np
from nltk.tokenize.simple import SpaceTokenizer
from scipy import sparse

# Use https://pypi.org/project/python-Levenshtein/ for fast alignment.
# install with: pip install python-Levenshtein
from Levenshtein.StringMatcher import StringMatcher

from jiant.utils.tokenizers import get_tokenizer, Tokenizer
from jiant.utils.utils import unescape_moses

# Tokenizer instance for internal use.
_SIMPLE_TOKENIZER = SpaceTokenizer()
_SEP = " "  # should match separator used by _SIMPLE_TOKENIZER

# Type alias for internal matrices
Matrix = NewType("Matrix", Union[Type[sparse.csr_matrix], Type[np.ndarray]])
_DTYPE = np.int32


def _mat_from_blocks_dense(mb, n_chars_src, n_chars_tgt):
    M = np.zeros((n_chars_src, n_chars_tgt), dtype=_DTYPE)
    for i in range(len(mb)):
        b = mb[i]  # current block
        # Fill in-between this block and last block
        if i > 0:
            lb = mb[i - 1]  # last block
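This module builds character-level alignments between tokenizations. As a sketch of the primitive it builds on (plain nltk, nothing project-specific), SpaceTokenizer can also report the character span of each token:

from nltk.tokenize.simple import SpaceTokenizer

s = "hello brave new world"
spans = list(SpaceTokenizer().span_tokenize(s))
print(spans)                       # [(0, 5), (6, 11), (12, 15), (16, 21)]
print([s[b:e] for b, e in spans])  # ['hello', 'brave', 'new', 'world']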
class GraphBuilder_Bag_Of_Words(GraphBuilder):
    """The edge between two authors is the Jaccard similarity between the bag of words of each author"""

    def __init__(self, db):
        GraphBuilder.__init__(self, db)
        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        start_time = time.time()
        logging.info("execute started for " + self.__class__.__name__ + " started at " + str(start_time))
        logging.info("getting posts from DB")
        if self._num_of_random_authors_for_graph is None:
            posts_by_domain = self._db.get_author_posts_dict_by_minimal_num_of_posts(
                self._domain, self._min_number_of_posts_per_author)
        else:
            # if not self._are_already_randomize_authors_for_graphs():
            #     self._db.randomize_authors_for_graph(self._min_number_of_posts_per_author, self._domain, self._num_of_random_authors_for_graph)
            posts_by_domain = self._db.get_random_author_posts_dict_by_minimal_num_of_posts()

        all_authors_count = len(posts_by_domain.keys())
        total_combinations = (all_authors_count * (all_authors_count - 1)) / 2
        current = 0

        # Dictionary: key = author_guid, value = list of posts
        bag_of_words_per_author = {}
        for author, posts in posts_by_domain.iteritems():
            bow = []
            for post in posts:
                content = post.content
                if content is not None:
                    bow += self._tokenizer.tokenize(content)
            bag_of_words_per_author[author] = frozenset(bow)
            current += 1
            if current % 10000 == 0:
                print('\r done author ' + str(current) + ' out of ' + str(all_authors_count), end='')
        logging.info("done computing bag of words")

        all_pairs = combinations(bag_of_words_per_author.keys(), 2)
        """
        Casting all_pairs to an iterable object (frozenset) is NOT a good idea since
        the combinations function returns a generator object, which is more memory
        and CPU efficient than iterable objects
        """
        logging.info("computing similarity between bag of words")
        author_connections = []
        current = 0
        for author_a, author_b in all_pairs:
            weight = self.compute_jaccard_index(bag_of_words_per_author[author_a],
                                                bag_of_words_per_author[author_b])
            author_connections.append((author_a, author_b, weight))
            current += 1
            if current % 10000 == 0:
                print('\r done pair ' + str(current) + ' out of ' + str(total_combinations), end='')
                self._fill_author_connections(author_connections)
                author_connections = []
        self._fill_author_connections(author_connections)
        end_time = time.time()
        duration = end_time - start_time
        logging.info(" total time taken " + str(duration))

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)
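A quick worked check of compute_jaccard_index: with two three-word bags sharing two words, the intersection is 2 and the union is 4, giving 0.5.

set_1 = frozenset(["the", "cat", "sat"])
set_2 = frozenset(["the", "cat", "ran"])
n = len(set_1.intersection(set_2))             # 2 shared words
print(n / float(len(set_1) + len(set_2) - n))  # 2 / (3 + 3 - 2) = 0.5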
def get_words_by_content(content):
    words = []
    tokenizer = SpaceTokenizer()
    words += tokenizer.tokenize(content)
    return words
def __init__(self):
    NltkTokenizer.__init__(self)
    _SpaceTokenizer.__init__(self)
        loss.backward()
        optimizer.step()

        # logging
        if (i % log_interval) == 0 and i != 0:
            # TODO:
            # 1) implement training function
            print("batch_idx {:d} | loss {:.4f}".format(i, total_loss / log_interval))
            # 2) wrap up preprocessing -> create dataloader in certain form
            total_loss = 0


if __name__ == "__main__":
    tokenizer = SpaceTokenizer()
    normalize_corpus = np.vectorize(normalize)

    raw = gutenberg.sents('bible-kjv.txt')
    start_time = time.time()
    norm = normalize_corpus(raw[:100])
    elapsed = time.time() - start_time

    # fill out dictionary
    dictionary = Dictionary()
    for sent in norm:
        words = tokenizer.tokenize(sent)
        for word in words:
            dictionary.add_word(word)
    '''
    print("length of dict: ", len(dictionary))
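Dictionary and normalize are defined elsewhere in this script. This minimal sketch (an assumption, not the original class) covers the interface the snippets above rely on, namely add_word, word2idx, and __len__:

class Dictionary:
    """Minimal word <-> index mapping sufficient for the snippets above."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        # Assign the next free index to unseen words
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)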
def main():
    USER = "******"
    PASSWD = "089567"
    HOST = "localhost"
    CHARSET = "utf8mb4"
    DATABASE = "music"
    COLLATE = "utf8mb4_unicode_ci"
    #PORT = "3306"
    PATH = './'
    PATH_U = './users'
    x = sql_worker(USER, PASSWD, HOST, CHARSET, DATABASE, COLLATE)
    print("Connection", x.connect())
    print("Creation", x.create())
    print("Using", x.use(DATABASE))
    print("Please wait, processing the database")
    dlalbom = []
    dlpesnya = []
    textsong = []
    artist = x.select("select id,name from wc_lyricsnet_artists limit 1000")
    for elem in artist:
        album = x.select(
            "select `id`,`name` from `wc_lyricsnet_albums` where `artist_id` = '%s' and `year` = 1995;" % (elem[0]))
        for each in album:
            song = x.select(
                "select `title` from `wc_lyricsnet_songs` where `artist_id` = '%s' and `album_id` = '%s';" % (elem[0], each[0]))
            for ef in song:
                ef = list(ef)
                ef = ''.join(map(str, ef))
                textsong.append(ef)
                #csvwww.write(str(elem[1]) + "\t" + str(each[1]) + "\t" + str(ef) + "\n")
                # building the dictionary
                #print(elem[1])
                #print(each[1])
                #print(ef)
                dlinaalbom = len(str(each[1]))
                dlinasong = len(str(ef))
                dlalbom.append(dlinaalbom)
                dlpesnya.append(dlinasong)
    xx = np.array(dlalbom)
    yy = np.array(dlpesnya)
    pylab.plot(yy, xx, 'r')
    pylab.xlabel('Number of characters in the song title')
    pylab.ylabel('Number of characters in the album title')
    pylab.title('Ratio of title lengths (in characters)')
    pylab.show()
    correl = np.corrcoef(yy, xx)
    print("Correlation between song and album title lengths")
    print("Close the chart to continue")
    print("###")
    print(correl)
    plt.scatter(yy, xx)
    plt.show()
    print("Pearson correlation", scipy.stats.pearsonr(yy, xx))
    print("\n")

    # LDA
    texts = []
    tokentoken = SpaceTokenizer()    # tokenization
    en_stop = get_stop_words('en')   # stop words
    p_stemmer = PorterStemmer()      # stemming
    for i in textsong:
        raw = i.lower()
        tokens = tokentoken.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dic = corpora.Dictionary(texts)
    corps = [dic.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corps, num_topics=7, id2word=dic, passes=20)
    try:
        plt.style.use('ggplot')
    except:
        pass
    print("LDA results:")
    print("###")
    print(ldamodel.print_topics(num_topics=7, num_words=3))
    print("\n")

    allwords = []
    for elements in texts:
        for allell in elements:
            allwords.append(allell)
    stoplist = set('for a of the and to in'.split())

    # LSI
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in allwords]
    alltokens = sum(texts, [])
    tokens1 = set(word for word in set(alltokens) if alltokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens1] for text in texts]
    corp = [dic.doc2bow(text) for text in texts]
    lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dic, num_topics=3)
    print("LSI results:")
    print("###")
    print(lsi.print_topics(3))

    slovnik = {}
    with open("csvdict1.csv", 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            kluch1 = row[2].split(" ")
            for elemk in kluch1:
                kluch = str(elemk).lower()
                if kluch in slovnik:
                    slovnik[kluch] += 1
                else:
                    slovnik[kluch] = 1

    # Parsed into a set of words so that membership tests match whole words
    # rather than substrings of one long string.
    stopslova = {w.strip() for w in (
        "a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, "
        "because, been, before, being, below, between, both, but, by, can't, cannot, could, couldn't, "
        "did, didn't, do, does, doesn't, doing, don't, down, during, each, few, for, from, further, "
        "had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, he's, her, here, here's, "
        "hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into, is, "
        "isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, "
        "off, on, once, only, or, other, ought, our, ours, ourselves, out, over, own, same, shan't, "
        "she, she'd, she'll, she's, should, shouldn't, so, some, such, than, that, that's, the, "
        "their, theirs, them, themselves, then, there, there's, these, they, they'd, they'll, "
        "they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, "
        "we'd, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, "
        "which, while, who, who's, whom, why, why's, with, won't, would, wouldn't, you, you'd, "
        "you'll, you're, you've, your, yours, yourself, yourselves").split(",")}

    sortklucha = sorted(slovnik, key=lambda x: int(slovnik[x]), reverse=True)
    zapis = open("slovnik.csv", 'w')
    zapis2 = open("slovnikstopwords.csv", "w")
    zapis.write("Word" + "\t" + "Frequency" + "\n")
    zapis2.write("Word" + "\t" + "Frequency" + "\n")
    try:
        for kluch in sortklucha:
            # full list goes to slovnik.csv (the original wrote to zapis2 here by mistake)
            zapis.write("{0}\t{1}\n".format(kluch, slovnik[kluch]))
        print("Frequency dictionary created")
    finally:
        zapis.close()
    try:
        for kluch in sortklucha:
            if kluch not in stopslova:
                zapis2.write("{0}\t{1}\n".format(kluch, slovnik[kluch]))
        print("Frequency dictionary with stop words removed created")
    finally:
        zapis2.close()
import json
import nltk
import re
import os
import argparse

from nltk.tokenize.simple import SpaceTokenizer

tk = SpaceTokenizer()


def get_bio_target(opinion):
    try:
        # will throw an exception if the opinion target is None type
        text, idxs = opinion["Target"]
    except TypeError:
        return []
    except ValueError:
        return []
    # get the beginning and ending indices
    if len(text) > 1:
        updates = []
        for t, idx in zip(text, idxs):
            bidx, eidx = idx.split(":")
            bidx = int(bidx)
            eidx = int(eidx)
            polarity = opinion["Polarity"]
            target_tokens = t.split()
            label = "-targ-{0}".format(polarity)
            # tags = []
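The exact opinion schema is not shown in this excerpt; the following hypothetical record illustrates the shape the function assumes, with "Target" holding parallel lists of surface strings and "begin:end" character offsets:

# Hypothetical input for get_bio_target (names and offsets illustrative only)
opinion = {
    "Target": [["the service", "the staff"], ["10:21", "26:35"]],
    "Polarity": "Negative",
}
# For each (t, idx) pair the loop derives bidx=10, eidx=21 (then 26, 35),
# splits t into tokens, and builds a label such as "-targ-Negative".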
article["relevance"] = matched / len(article["relevant"]) * \ (100 - (100 - article["reputability"] / 2.0) / 100) Return article with highest relevance score """ from nltk.stem.porter import PorterStemmer from nltk.tokenize.simple import SpaceTokenizer from nltk.corpus import stopwords import string import time from urlparse import urlparse import re stemmer = PorterStemmer() tokenizer = SpaceTokenizer() def tokenize(text): text = "".join(c for c in text if c not in string.punctuation) return tokenizer.tokenize(text) def remove_stopwords(text): filtered_words = [word for word in text if word not in stopwords.words('english')] return filtered_words def sanitize_description(description): return description.replace("Continue reading...", "").replace("\n", " ") # let's work on this... maybe TODO?
class Bag_Of_Words_Graph_Builder(GraphBuilder):
    """The edge between two authors is the Jaccard similarity between the bag of words of each author"""

    def __init__(self, db):
        GraphBuilder.__init__(self, db)
        self._author_guid_posts_dict = {}
        self._author_guid_bag_of_words_dict = {}
        self._word_dict = {}
        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        pass

    def fill_author_guid_posts_dictionary(self):
        self._author_guid_posts_dict = self._db.get_author_posts_dict_by_minimal_num_of_posts(
            self._domain, self._min_number_of_posts_per_author)

    def fill_author_guid_bag_of_words_dictionary(self):
        all_authors_count = len(self._author_guid_posts_dict.keys())
        i = 0
        # Dictionary: key = author_guid, value = list of posts
        for author_guid, posts in self._author_guid_posts_dict.iteritems():
            bow = []
            for post in posts:
                content = post.content
                # check for None before calling .lower() (the original lowered first,
                # which raises AttributeError on None content)
                if content is not None:
                    content = content.lower()
                    content = re.sub(r'http\S+', '', content)
                    bow += self._tokenizer.tokenize(content)
            bow = list(frozenset(bow))
            self._author_guid_bag_of_words_dict[author_guid] = bow
            for word in bow:
                if word not in self._word_dict:
                    self._word_dict[word] = word
            i += 1
            if i % 100000 == 0:
                print('\r done author ' + str(i) + ' out of ' + str(all_authors_count), end='')
        logging.info("done computing bag of words")

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)

    def fill_author_guid_bag_of_words_dictionary_and_calculate_all_combinations(self):
        self.fill_author_guid_bag_of_words_dictionary()
        author_guids = self._author_guid_bag_of_words_dict.keys()
        all_authors_count = len(author_guids)
        all_pairs = combinations(author_guids, 2)
        total_combinations = (all_authors_count * (all_authors_count - 1)) / 2
        """
        Casting all_pairs to an iterable object (frozenset) is NOT a good idea since
        the combinations function returns a generator object, which is more memory
        and CPU efficient than iterable objects
        """
        logging.info("computing similarity between bag of words")
        i = 0
        for author_guid_1, author_guid_2 in all_pairs:
            i += 1
            print('\r calculating pairs of authors : {0}/{1}'.format(i, total_combinations), end='')
            author_guid_1_bag_of_words = self._author_guid_bag_of_words_dict[author_guid_1]
            author_guid_2_bag_of_words = self._author_guid_bag_of_words_dict[author_guid_2]
            self.calculate_jaccard_index_create_and_save_connection(
                author_guid_1, author_guid_2,
                author_guid_1_bag_of_words, author_guid_2_bag_of_words)
        self._db.save_author_connections(self._author_connections_edges)

    def calculate_jaccard_index_create_and_save_connection(self, author_guid_1, author_guid_2,
                                                           author_guid_1_bag_of_words,
                                                           author_guid_2_bag_of_words):
        weight = self.compute_jaccard_index(set(author_guid_1_bag_of_words),
                                            set(author_guid_2_bag_of_words))
        self._create_and_optional_save_connection(author_guid_1, author_guid_2, weight)