def analysis_word(self, word):
    # map langid's language code to the tags used elsewhere in this class
    lang = langid.classify(word)[0]
    if lang == 'ja' or lang == 'zh':
        return 'jp'
    return 'en'
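If only a handful of languages are expected, langid can be constrained to a fixed candidate set, which tends to help on very short inputs such as single words. A minimal sketch assuming the langid.py package; the candidate set here is illustrative:

import langid

# restrict classification to an assumed candidate set (illustrative choice)
langid.set_languages(['en', 'ja', 'zh'])

print(langid.classify('hello'))  # ('en', <score>)
print(langid.classify('規定'))    # constrained to the candidate set above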
def parse_data(data):
    """
    Parse all unique sentences in data.

    :param data: pandas.DataFrame with text data
    :returns parsed_data: pandas.DataFrame with text data
    """
    parser_en = spacy.load('en_core_web_md', disable=['ner', 'textcat'])
    parser_es = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])
    # use custom NLTK tokenizers better suited to social media text
    parser_en.tokenizer = NLTKTokenizerSpacy(parser_en.vocab, TweetTokenizer())
    parser_es.tokenizer = NLTKTokenizerSpacy(parser_es.vocab, ToktokTokenizer())
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    parsed_data = []
    for i, data_i in data.iterrows():
        txt = data_i.loc['txt']
        txt = clean_data_for_spacy(txt)
        sents = sent_tokenize(txt)
        parsed_data_i = []
        for sent in sents:
            if data_i.loc['lang'] == 'es':
                parse_i = parser_es(sent)
            else:
                parse_i = parser_en(sent)
            # extract tree
            tree_i = build_parse(parse_i, parse_type='spacy')
            parsed_data_i.append(tree_i)
        parsed_data_i = pd.DataFrame(pd.Series(parsed_data_i), columns=['parse'])
        parsed_data_i = parsed_data_i.assign(**{'id': int(data_i.loc['id'])})
        parsed_data.append(parsed_data_i)
    parsed_data = pd.concat(parsed_data, axis=0)
    return parsed_data
def translate_with_pronounce(self, word):
    self.init(word)
    lang = langid.classify(word)[0]
    t = None
    if lang == 'en':
        self.lang = 'en'
    elif lang == 'ja' or lang == 'zh':
        self.lang = 'jp'
        h = HujiangTranslation()
        h.init(word)
        t = h.get_data()
    else:
        self.lang = 'en'
    self.parse_strategy.lang = self.lang
    self.data = {
        'from': self.lang,
        'to': 'zh',
        # 'source': 'txt',
        'query': self.url.request_key_param
    }
    if self.lang != 'zh':
        if t:
            return (self.get_data(), t)
        return (self.get_data(), '')
    return ('zh', '')
def detect_language(text, threshold=0.9):
    classif = langid.classify(text)
    if not classif:
        return 'und'
    if classif[1] >= threshold:
        return classif[0]
    return 'und'
def detect_language(text):
    classif = langid.classify(text)
    if not classif:
        return 'UNK'
    if classif[1] > 0.9:
        return classif[0]
    return 'UNK'
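Both thresholding variants above assume a confidence in [0, 1]. The module-level langid.classify returns an unnormalized log-probability by default, so a sketch of obtaining normalized confidences (assuming the langid.py package):

from langid.langid import LanguageIdentifier, model

# identifier configured to rescale scores to probabilities in [0, 1]
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

lang, prob = identifier.classify("This is a test")
print(lang, prob)  # e.g. ('en', 0.99...)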
def is_english(text):
    """
    Return the review if it is in English, otherwise None.

    :param text: review text
    :return: the text if classified as English, else None
    """
    if text is None:
        return None
    if langid.classify(text)[0] == 'en':  # drop non-English reviews
        return text
    return None
def gen_output(data, json_data_dir):
    term, is_reply, tweets_needed = data
    dataset = []
    # get all user files
    files = glob.glob(os.path.join(json_data_dir, "*"))
    random.shuffle(files)
    for f in files:
        user = TwitterUser()
        user.populate_tweets_from_file(f, store_json=True, do_arabic_stemming=False, lemmatize=False)
        if 50 <= user.n_total_tweets <= 10000 and \
                user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
            tweet_set = [t for t in user.tweets if t.retweeted is None and
                         len(t.urls) == 0 and 'http:' not in t.text and
                         len(t.tokens) > 5 and
                         t.created_at >= MIN_TWEET_DATE and
                         (term == '' or term in t.tokens) and
                         langid.classify(t.text)[0] == 'en' and
                         sentiment(t.text)['compound'] != 0]
            if is_reply:
                tweet_set = [t for t in tweet_set if t.reply_to]
            else:
                tweet_set = [t for t in tweet_set if not t.reply_to]
            if len(tweet_set) == 0:
                print('size 0', term, tweets_needed, is_reply)
                continue
            tweet = random.sample(tweet_set, 1)[0]
            print(user.screen_name, term, tweets_needed, is_reply, ":::: ", tweet.text)
            dataset.append(tweet)
            tweets_needed -= 1
            if tweets_needed == 0:
                name = term if term != '' else 'random'
                name += '_reply' if is_reply else '_non_reply'
                pickle.dump(dataset, open(name + ".p", 'wb'))
                print('done with: ', name, is_reply)
                return
        else:
            print('failed user')
def write_status_txt(tweet_file, out_file_status, out_file_txt, langs=['en']):
    """
    Write status info and text to file.

    :param tweet_file: tweet input file
    :param out_file_status: status output file
    :param out_file_txt: text output file
    """
    status_idx = [x for x in X_KEYS if x != 'text']
    with gzip.open(out_file_status, 'wb') as out_file_status_output, open(out_file_txt, 'wb') as out_file_txt_output:
        with gzip.open(tweet_file, 'r') as tweet_file_input:
            for i, x in enumerate(tweet_file_input):
                # check whether the input is JSON, using the first line
                if i == 0:
                    x_str = x.decode('utf-8').strip()
                    try:
                        x_test = load_json(x_str)
                        input_file_json = type(x_test) is dict
                    except Exception as e:
                        logging.debug('json error %s with tweet %s' % (e, str(x)))
                        input_file_json = False
                # skip dummy .tsv header line
                if not input_file_json and x.decode('utf-8').split('\t')[0] == 'username':
                    continue
                else:
                    x_data = process_line(x, input_file_json)
                    x_data_status = x_data.loc[status_idx].values
                    x_data_txt = clean_txt(x_data.loc['text'])
                    if x_data_txt == '':
                        logging.debug('empty text at status %s' % (x))
                    ## TODO: include original status text in status data file
                    ## only tag valid statuses
                    if x_data_txt != '':
                        ## filter for language
                        x_lang = langid.classify(x_data_txt)
                        if x_lang[0] in langs:
                            x_data_status = np.append(x_data_status, [x_lang[0]])
                            x_data_status_str = [str(y) for y in x_data_status]
                            out_file_status_output.write(('%s\n' % ('\t'.join(x_data_status_str))).encode('utf-8'))
                            out_file_txt_output.write(('%s\n' % (x_data_txt)).encode('utf-8'))
                if i % 1000 == 0:
                    logging.debug('processed %d tweets' % (i))
def prepare_data():
    reviews: List[Reviews] = load_reviews('rozetka.csv')
    print(f'Total records: {len(reviews)}')
    filtered: List[Reviews] = []
    for r in reviews:
        if len(r.review) < 32:
            continue  # too short
        lang = langid.classify(r.review)
        if lang[0] != 'uk':
            continue  # not Ukrainian
        filtered.append(r)
    print(f'Filtered records: {len(filtered)}')
    shuffle(filtered)
    threshold = int(len(filtered) * 0.7)
    learn = filtered[:threshold]
    print(f"Learn records set: {len(learn)}")
    test = filtered[threshold:]
    print(f"Test records set: {len(test)}")
    save_reviews('rozetka_learn.csv', learn)
    save_reviews('rozetka_test.csv', test)
def parse_data_tweebo(data, tmp_dir='../../data/mined_tweets/'):
    """
    Parse all unique sentences in data, using TweeboParser for English
    and spacy for Spanish. Assumes that TweeboParser has already been installed.
    TODO: extract from ugly CoNLL output

    :param data: pandas.DataFrame with text data
    :returns parsed_data: pandas.DataFrame with text data
    """
    ## TweeboParser requires the English data to be written to file first
    parser_es = spacy.load('es_core_news_sm', disable=['tagger', 'ner', 'textcat'])
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    data_es = data[data.loc[:, 'lang'] == 'es']
    data_en = data[data.loc[:, 'lang'] != 'es']
    ## EN parses
    # write to temp file
    en_txt_file = os.path.join(tmp_dir, 'en_parse_tmp.txt')
    with open(en_txt_file, 'w') as en_txt_out:
        # clean text
        txt_clean = data_en.loc[:, 'txt'].apply(lambda x: clean_parse_txt(x)).values
        en_txt_out.write('\n'.join(txt_clean))
    # parse (run through the shell so the "cd ... && ..." compound command works)
    en_txt_parse_file = '%s.predict' % (en_txt_file)
    parse_command = 'cd TweeboParser && ./run.sh %s' % (en_txt_file)
    process = subprocess.Popen(parse_command, shell=True, stdout=subprocess.PIPE)
    output, err = process.communicate()
    # read parses
    parsed_data_en = read_tweeboparse_output(en_txt_parse_file)
    # remove files
    os.remove(en_txt_file)
    os.remove(en_txt_parse_file)
    ## ES parses
    parsed_data_es = data_es.loc[:, 'txt'].apply(
        lambda x: [build_parse(parser_es(y), parse_type='spacy') for y in sent_tokenize(x)])
import csv

from langid import langid

if __name__ == '__main__':
    with open("Electronics.txt", "r", encoding="utf-8") as readFile:
        with open("electronics_filtered.csv", "w", encoding="utf-8", newline="") as writeFile:
            csvWriter = csv.writer(writeFile)
            currRow = ["", ""]
            for line in readFile.readlines():
                if line[-1] == "\n":
                    line = line[:-1]
                if line.startswith("review/score"):
                    currRow[0] = line[13:].strip()
                if line.startswith("review/text"):
                    currRow[1] = line[12:].strip()
                    csvWriter.writerow(currRow)
                    currRow = ["", ""]

    with open("electronics_filtered.csv", "r", encoding="utf-8") as filteredReadFile:
        with open("electronics_filtered_and_smaller.csv", "w", encoding="utf-8", newline="") as filteredWriteFile:
            csvReader = csv.reader(filteredReadFile)
            csvWriter = csv.writer(filteredWriteFile)
            for row in csvReader:
                # skip neutral (3-star) reviews; scores are read back from the CSV as strings
                if row[0] == "3.0":
                    continue
                language, score = langid.classify(row[1])
                newRow = [language, row[0], row[1]]
                csvWriter.writerow(newRow)
curr_dataset = datasets_to_collect[0]
print(datasets_to_collect)
for f in files:
    user = TwitterUser(filename_for_tweets=f)
    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and \
            user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        tweet_set = [t for t in user.tweets if t.retweeted is None and
                     len(t.urls) == 0 and
                     len(t.tokens) > 5 and
                     t.created_at <= MIN_TWEET_DATE and
                     curr_dataset[0] in t.tokens and
                     langid.classify(t.text)[0] == 'en' and
                     sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue
        tweet = random.sample(tweet_set, 1)[0]
        print(user.screen_name, curr_dataset[0:2], ":::: ", tweet.text)
        curr_dataset[2].append(tweet)
        curr_dataset[1] -= 1
        if curr_dataset[1] == 0:
            pickle.dump(curr_dataset[2], open(curr_dataset[0] + ".p", 'wb'))
            if len(datasets_to_collect) == 1:
                print('DONE!!!')
                break
            datasets_to_collect = datasets_to_collect[1:]
            curr_dataset = datasets_to_collect[0]
from langid import langid

s1 = '中文'
s2 = 'contenders'
s3 = 'こにちは'
s4 = '規定'
s5 = '上げる'
s6 = '寝る'

l = []
l.append(langid.classify(s1))
l.append(langid.classify(s2))
l.append(langid.classify(s3))
l.append(langid.classify(s4))
l.append(langid.classify(s5))
l.append(langid.classify(s6))

print('https://sp1.baidu.com/5b11fzupBgM18t7jm9iCKT-xh_/sensearch/selecttext?cb=jQuery110205234840516971939_1540970393811&q=content')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--top_communities',
                        default='../data/community_stats/2015_2016_top_500_communities.txt')
    parser.add_argument('--years', default='2015')
    parser.add_argument('--sample_size', default=100)
    args = parser.parse_args()
    top_communities = [l.lower().strip() for l in open(args.top_communities, 'r')]
    sample_size = args.sample_size
    years = args.years.split(',')
    lang_ctr = {c: defaultdict(float) for c in top_communities}
    es = Elasticsearch(timeout=TIMEOUT)
    # split over the two years
    for i, c in enumerate(top_communities):
        print('querying community #%d = %s' % (i, c))
        # collect sample
        community_post_sample = []
        for y in years:
            index = 'reddit_comments-%s-2' % (y)
            query = {"query": {"constant_score": {"filter": {"term": {"subreddit": c}}}}}
            results = helpers.scan(es, query=query, index=index)
            # reservoir sample
            ctr = 0
            for r in results:
                text = r['_source']['body']
                if text != '[deleted]':
                    # if smaller than sample, add to sample
                    if len(community_post_sample) < sample_size:
                        community_post_sample.append(text)
                    # otherwise probabilistically replace
                    elif random() <= REPLACE:
                        replace_index = randint(0, sample_size - 1)
                        community_post_sample[replace_index] = text
                    ctr += 1
                    if ctr % 100000 == 0:
                        print('processed %d valid comments' % (ctr))
        # convert sample to languages
        for t in community_post_sample:
            language, confidence = classify(t)
            lang_ctr[c][language] += 1
        # normalize!
        for k in lang_ctr[c].keys():
            lang_ctr[c][k] /= sample_size
    # now write to file
    lang_ctr = pd.DataFrame(lang_ctr)
    out_dir = os.path.dirname(args.top_communities)
    out_file = os.path.join(out_dir, 'community_lang_sample.tsv')
    lang_ctr.to_csv(out_file, sep='\t')
def extract_langid(sentence):
    ensure_package_path()
    from langid import langid
    message = sentence['Sentence']
    sentence['Language'] = langid.classify(message)[0]
    return sentence
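Where downstream code needs more than the single best guess, langid also exposes a ranking over candidate languages. A small illustrative sketch assuming the langid.py package; the sentence dict mirrors the shape used above:

from langid import langid

sentence = {'Sentence': 'Das ist ein kurzer Satz.'}

# single best guess, as in extract_langid above
lang, score = langid.classify(sentence['Sentence'])

# full ranking over candidate languages, most likely first
ranking = langid.rank(sentence['Sentence'])
print(lang, ranking[:3])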
def alias_by_str(text):
    # return the upper-cased code of the detected language
    return langid.classify(text)[0].upper()
def closed_pull_requests_summary(self):
    pulls_file = self.folder + '/pull_requests.json'
    pulls_summary_file = self.folder + '/pulls_closed.csv'
    if os.path.isfile(pulls_file) and not os.path.isfile(pulls_summary_file):
        with open(pulls_file, 'r') as pulls:
            data = json.load(pulls)
        with open(pulls_summary_file, 'a') as output:
            fieldnames = ['pull_request', 'number_of_commits', 'number_of_comments',
                          'number_of_reviews', 'user_type', 'user_login', 'closed_at',
                          'number_of_additions', 'number_of_deletions',
                          'number_of_files_changed', 'number_of_days', 'message',
                          'number_of_characters', 'second_line_is_blank', 'language']
            writer = csv.DictWriter(output, fieldnames=fieldnames)
            writer.writeheader()
            for pull_request in data:
                if pull_request['state'] == 'closed' and pull_request['merged_at'] is None:
                    try:
                        number_of_commits = self.collector.commits_in_pull_request(pull_request['number'])
                        number_of_comments = self.collector.comments_in_pull_request(pull_request['number'])
                        number_of_reviews = self.collector.reviews_in_pull_request(pull_request['number'])
                        pull_request_data = self.collector.pull_request(pull_request['number'])
                        number_of_files_changed = None
                        number_of_additions = None
                        number_of_deletions = None
                        message = ''
                        if pull_request_data:
                            if 'changed_files' in pull_request_data:
                                number_of_files_changed = pull_request_data['changed_files']
                            if 'additions' in pull_request_data:
                                number_of_additions = pull_request_data['additions']
                            if 'deletions' in pull_request_data:
                                number_of_deletions = pull_request_data['deletions']
                            if 'body' in pull_request_data and pull_request_data['body'] is not None:
                                message = pull_request_data['body']
                        number_of_characters = len(message)
                        second_line_is_blank = False
                        lines = message.split('\n')
                        if len(lines) > 1 and not lines[1].strip():
                            second_line_is_blank = True
                        language = langid.classify(message)[0]
                        created_at = datetime.strptime(pull_request['created_at'], '%Y-%m-%dT%H:%M:%SZ')
                        closed_at = datetime.strptime(pull_request['closed_at'], '%Y-%m-%dT%H:%M:%SZ')
                        number_of_days = (closed_at - created_at).days
                        # site admins are treated as internal contributors
                        user_type = 'Internals' if pull_request['user']['site_admin'] else 'Externals'
                        writer.writerow({
                            'number_of_characters': number_of_characters,
                            'second_line_is_blank': second_line_is_blank,
                            'language': language,
                            'pull_request': pull_request['number'],
                            'number_of_commits': len(number_of_commits),
                            'number_of_comments': len(number_of_comments),
                            'number_of_reviews': len(number_of_reviews),
                            'user_type': user_type,
                            'user_login': pull_request['user']['login'],
                            'closed_at': closed_at,
                            'number_of_additions': number_of_additions,
                            'number_of_deletions': number_of_deletions,
                            'number_of_files_changed': number_of_files_changed,
                            'number_of_days': number_of_days,
                            'message': message
                        })
                    except Exception as ex:
                        with open('error.log', 'a') as errors:
                            errors.write(str(ex))
                            errors.write('\n Repository:' + self.folder + '\n')
def preprocess(self, path_data_full, path_data_processed, data_range):
    all_quotes = []
    words_hist = []
    glove = GloVe('6B')
    all_tokens = list(glove.token_to_index.keys())

    # Punctuation and symbols to strip or replace
    punct_dict = {
        '.': '', '!': ' ', '?': ' ', '...': ' ', ',': ' ', ';': ' ', ':': ' ',
        '\u201D': ' ', '\u2019\u2019': ' ', ' \'': ' ', '\' ': ' ', '"': ' ',
        '--': ' ', '-': ' ', '\u201C': '', '\u2019': ' ', '\u2026': ' ',
        '(': ' ', ')': ' ', '[': ' ', ']': ' ', '{': ' ', '}': ' ',
        '\u2014': ' ', '+': ' ', '„': ' ', '[]': ' ', '()': ' ', '{}': ' ',
        '=': ' ', '♕': ' ', '@': ' ', '*': ' ', '&': ' and ', '#': ' ', '~': ' ',
        '\u2E2E': ' ', '\u2640': ' ', '\\': ' ', '/': ' ', '\u2665': ' ',
        '\u2764': ' ', '\u2018': ' ', '\u265B': ' ', '\u262F': ' ', '\u2013': ' ',
        '\uFF07': ' ', '\uFF07\uFF07': ' ', '\uFF40': ' ', '\u02CB': ' ',
        '\u0300': ' ', '%': ' %', '\u02BC': ' ', '\u02BC\u02BC': ' ', 'ღ': ' ',
        '\u2500': ' ', '\u202c': ' ', '\u0301': ' ', '\u202A': ' ', '<': ' ',
        '>': ' ', '❞': ' ', 'ε': ' ', '\u2637': ' ', '↺': ' ', '®': ' ', '$': ' ',
        '❣': ' ', '\u2015': ' ', '\u0313': ' ', '\u201B': ' ', '\u2032': ' ',
        '\u05F3': ' ', '\'': ' ', '`': ' ', '\u200E': ' ', ' ': ' ', ' ': ' ', ' ': ' ',
    }

    # Contractions and common run-together words to expand
    short_dict = {
        'i\'m ': ' i am ', 'i\u0301m ': ' i am ', 'i\u2019m ': ' i am ',
        'it\'s ': ' it is ', 'it\u2019s ': ' it is ', 'it´s ': ' it is ',
        '\u00B4ll': ' will ', 'won\u00b4t ': ' will not ', '\u00B4re ': ' are ',
        '\u00B4ve ': ' have ', 'i\u00B4m ': ' i am ', ' won\'t ': ' will not ',
        'i\u0060m ': ' i am ', 'man\'s ': ' mans ', 'won\u2019t ': ' will not ',
        'can\'t ': ' cannot ', '\'re ': ' are ', 'can\u0060t ': ' cannot ',
        '\u0060ve ': ' have ', 'won\u0060t ': ' will not ', 'n\u0060t ': ' not ',
        '\u02B9s ': ' is ', '\u0374s ': ' is ', '\u0374ve ': ' have ',
        '\u0374re ': ' are ', '\u02B9ve ': ' have ', '\u02B9re ': ' are ',
        '\'ve ': ' have ', '\'ll ': ' will ', '\u0060ll ': ' will ',
        '\'d ': ' would ', 'n\'t ': ' not ', '\'s ': ' is ',
        'don\u2019t ': ' do not ', 'me\u2026 ': ' me ', '\u2019s ': ' is ',
        '\u2019re ': ' are ', '\u0060re ': ' are ', 'if\u2026 ': ' if ',
        'day\u2026 ': ' day ', 'n\u2019t ': ' not ', '\u2019ll ': ' will ',
        '\u2019d ': ' would ', 'n´t ': ' not ', '\u0301re ': ' are ',
        '\u0301ve ': ' have ', '̵͇̿̿з ': ' ', '•̪ⓧ ': ' ', '̵͇̿̿ ': ' ',
        'isno ': 'is no ', 'kissand ': 'kiss and', 'ryanlilly ': 'ryan lilly ',
        'meand ': 'me and', 'whatlooks ': 'what looks', 'girlfriendcut ': 'girlfriend cut',
        'worldyou ': ' world you ', 'heavenis ': ' heaven is ', 'worldso ': ' world so ',
        'havebetter ': ' have better ', 'unknownand ': ' unknown and ',
        ' allof ': ' all of ', ' tolook ': ' to look ', ' notaffect ': ' not affect ',
        'likea ': ' like a ', 'wantedas ': ' wanted as ', 'agonyof ': ' agony of ',
        'skillthat ': ' skill that ', 'worldsall ': ' worlds all ',
        'awaywhat ': ' away what ', 'outwhat ': ' out what ', 'savewhat ': ' save what ',
        'educationso ': ' education so ', 'anyday ': ' any day ', 'usdo ': ' us do ',
        ' dependsupona ': ' depends upon a', ' wheelbarrowglazed ': ' wheelbarrow glazed ',
        'waterbeside': 'water beside', ' whitechickens ': ' white chickens ',
        ' ain\'t ': ' aint ',
    }

    with open(path_data_full, encoding='utf8') as json_file:
        data = json.load(json_file)
        if data_range == 0:
            data_range = len(data)
        for i in range(data_range):
            quote = data[i].get('Quote')
            # Detect language with langid; if not English, skip
            check_ld = langid.classify(quote)
            if check_ld[0] != 'en':
                continue
            # Cleansing of quotes
            # quote = re.sub(r"(?<![A-Z])(?<!^)([A-Z])", r" \1", quote)
            quote = re.sub(r"\.(?!\s)(?!$)", r". ", quote)
            quote = quote.lower()
            quote = quote.strip('...')
            quote = quote.strip('"')
            quote = quote.strip()
            for diction in short_dict, punct_dict:
                for inst in diction:
                    quote = quote.replace(inst, diction[inst])
            # Length control: skip quotes longer than 20 words
            enable_length_control = True
            if enable_length_control:
                if len(quote.split()) > 20:
                    continue
            # Check that every word is in the GloVe vocabulary; if not, drop the quote
            all_tokens_avail = True
            for word in quote.split():
                if word not in all_tokens:
                    all_tokens_avail = False
                    break
            if not all_tokens_avail:
                continue
            # Filter empty quotes out (need content to be used, obviously)
            if len(quote.split()) > 0:
                all_quotes.append(quote)
                for word in quote.split():
                    words_hist.append(word)

        self.total_word_count = len(words_hist)
        words_hist = Counter(words_hist)
        words_hist = words_hist.most_common(None)
        words, count = zip(*words_hist)
        self.word_count = dict(words_hist)
        vocabulary = dict(enumerate(words, 1))
        self.end_token = len(vocabulary) + 1
        vocabulary[f'{self.end_token}'] = self.end_token
        self.vocabulary = dict([(value, key) for key, value in vocabulary.items()])
        all_quotes = set(all_quotes)
        self.all_quotes = list(all_quotes)
        values = [' ', '', " ", ""]
        for i in self.all_quotes:
            for value in values:
                if i == value:
                    self.all_quotes.remove(value)
                    break
        self.quote_count = len(self.all_quotes)

    # Create a dict with all quotes and vocabulary statistics
    processed_data = {
        'Quotes': self.all_quotes,
        'Vocabulary': self.vocabulary,
        'Word count': self.word_count,
        'Words total': self.total_word_count,
        'Quotes total': self.quote_count,
    }
    # Write to file
    with open(path_data_processed, 'w') as outfile:
        json.dump(processed_data, outfile)

    # Delete variables that are no longer needed to free memory
    del data, quote, words_hist, words, count, all_quotes, vocabulary

    # Return all needed values
    return self.all_quotes, self.vocabulary, self.word_count, \
        self.total_word_count, self.end_token, self.quote_count