def file_to_words(file):
    """
    This tests that the file properly opens and returns a list of the
    words without spaces or new lines.
    """
    wc = WordCount()
    return wc._file_reader(file)
class ExamplesTestCase(TestCase):
    def test_chain(self):
        a, b = FirstJob(), ChainJob()
        self.job = JobChain({a: ['raw://0'], b: a})
        self.job.wait()
        self.assertResults(b, [(2, '')])

    def test_chunk(self):
        self.tag = 'disco:test:examples:chunks'
        self.job = LineChunker().run(input=[chekhov], params=self.tag)
        self.assertEquals(len(list(self.results(self.job))), 1)

    def test_discodb(self):
        if self.settings['DISCO_TEST_DISCODB']:
            a, b = WordCountDDB(), Query()
            b.params = 'discover'
            self.job = JobChain({a: [chekhov], b: a})
            self.job.wait()
            self.assertEquals(self.results(b).next()[1], ['2'])
        else:
            self.skipTest("DISCO_TEST_DISCODB not set")

    def test_grep(self):
        self.job = Grep().run(input=[chekhov], params='d.*?co')
        self.assertEquals(len(list(self.results(self.job))), 17)

    def test_wordcount(self):
        self.job = WordCount().run(input=[chekhov])
        self.assertEquals(len(list(self.results(self.job))), 12339)

    def tearDown(self):
        super(ExamplesTestCase, self).tearDown()
        if hasattr(self, 'tag'):
            self.ddfs.delete(self.tag)
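# The WordCount job exercised by test_wordcount above is Disco's classic
# word-count example. The sketch below is an assumption based on the
# standard Disco tutorial (map emits (word, 1), reduce sums per word) and
# is not necessarily the exact example class under test.
from disco.core import Job
from disco.util import kvgroup


class WordCount(Job):
    @staticmethod
    def map(line, params):
        # Emit one (word, 1) pair per whitespace-separated token.
        for word in line.split():
            yield word, 1

    @staticmethod
    def reduce(iter, params):
        # Group the sorted (word, 1) pairs by word and sum the ones.
        for word, counts in kvgroup(sorted(iter)):
            yield word, sum(counts)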
def total_word_count(list_of_words):
    wc = WordCount()
    manual_count = 0
    for word in list_of_words:
        wc._add_to_count(word)
        manual_count += 1
    assert wc.word_count_total() == manual_count
    assert wc.word_count_total() == wc._total_words
def adding_words(list_of_words):
    wc = WordCount()
    manual_count = 0
    for word in list_of_words:
        wc._add_to_count(word)
        manual_count += 1
        assert wc._word_counts[word] > 0
    assert wc._total_words == manual_count
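# The helpers above read like property-test bodies without a driver. One
# possible way to exercise them, assuming the Hypothesis library (an
# assumption; no test runner is shown in the source), is sketched below.
from hypothesis import given
from hypothesis import strategies as st

# Generate lists of short lowercase words to feed the helpers.
word_lists = st.lists(st.text(alphabet='abcdefghijklmnopqrstuvwxyz',
                              min_size=1, max_size=12))


@given(word_lists)
def test_total_word_count(list_of_words):
    total_word_count(list_of_words)


@given(word_lists)
def test_adding_words(list_of_words):
    adding_words(list_of_words)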
def count_file(filename):
    """
    This test only works for my numeric-based test files.
    """
    wc = WordCount()
    wc.count_file(filename)
    assert wc._total_words == 10
    assert wc._word_counts['zero'] == 0
    assert wc._word_counts['one'] == 1
    assert wc._word_counts['two'] == 2
    assert wc._word_counts['three'] == 3
    assert wc._word_counts['four'] == 4
def count_mult_files(list_of_filenames):
    wc = WordCount()
    mult = 1
    for file in list_of_filenames:
        wc.count_file(file)
        assert wc._total_words == 10 * mult
        assert wc._word_counts['zero'] == 0
        assert wc._word_counts['one'] == 1 * mult
        assert wc._word_counts['two'] == 2 * mult
        assert wc._word_counts['three'] == 3 * mult
        assert wc._word_counts['four'] == 4 * mult
        mult += 1
def _get_top_words(self, content, n):
    """Return top n links from content."""
    left = [m.start()
            for m in re.finditer(re.escape(self.LEFT_BRACKET), content)]
    right = [m.start()
             for m in re.finditer(re.escape(self.RIGHT_BRACKET), content)]
    wc = WordCount()
    for i in range(0, len(left)):
        wc.add(content[left[i] + len(self.LEFT_BRACKET):right[i]])
    return [key[0] for key in wc.top(n)]
def testWordCount(self):
    wc = WordCount()
    sc = wc.getSparkContext("WordCountTest", "local[*]")
    input = ["Apache Spark is a fast and general engine for large-scale data processing.",
             "Spark runs on both Windows and UNIX-like systems"]
    inputRDD = sc.parallelize(input)
    resultRDD = wc.process(inputRDD)
    resultMap = resultRDD.collectAsMap()
    self.assertEqual(resultMap['Spark'], 2)
    self.assertEqual(resultMap['UNIX-like'], 1)
    self.assertEqual(resultMap['runs'], 1)
    print(resultMap)
    sc.stop()
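# The test above only pins down the behaviour of WordCount.process and
# getSparkContext. A minimal sketch that would satisfy it is shown below;
# the class body is an assumption, since the implementation is not part of
# this file.
from pyspark import SparkContext


class WordCount(object):
    def getSparkContext(self, app_name, master):
        # Thin wrapper so the test can obtain a (local) SparkContext.
        return SparkContext(master=master, appName=app_name)

    def process(self, inputRDD):
        # Split each line on spaces, emit (word, 1) pairs, and sum per word.
        return (inputRDD
                .flatMap(lambda line: line.split(" "))
                .map(lambda word: (word, 1))
                .reduceByKey(lambda a, b: a + b))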
def findTags(file_path, pressnotes, cluster_number, language_code):
    language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
    wordcount = WordCount(language_codes[language_code])
    wordcount_dictionary = {}
    extras = {}
    for pressnote in pressnotes:
        wordcount.parse_text_extra(pressnote.title, wordcount_dictionary, extras)
        wordcount.parse_text_extra(pressnote.text, wordcount_dictionary, extras)
    sorted_wordcount = sorted(wordcount_dictionary.items(),
                              key=operator.itemgetter(1), reverse=True)
    tags = []
    for item in sorted_wordcount:  # item[0] is the stemmed word
        sorted_extras = sorted(extras[item[0]].items(),
                               key=operator.itemgetter(1), reverse=True)
        for extra_item in sorted_extras:
            # keep only the most frequent surface form of this stem
            tags.append(extra_item[0])
            break
        if len(tags) >= 10:
            break
    saveNotesToFile(file_path, pressnotes, cluster_number, tags)
def writers_words(list_of_files, outputfile):
    counter = WordCount()
    for file in list_of_files:
        counter.count_file(file)
    output = open(outputfile, 'w')
    stats = counter.word_stats()
    output.write("Total words counted: " + repr(counter.word_count_total())
                 + '\n')
    output.write('Rank'.rjust(RANK_IND)
                 + 'Word'.rjust(WORD_IND)
                 + 'Count'.rjust(COUNT_IND)
                 + ' ' * 5
                 + 'Percentage'.ljust(PERC_IND)
                 + '\n')
    rank = 1
    for (count, perc, list_of_words) in stats:
        for word in list_of_words:
            newline = (repr(rank).rjust(RANK_IND)
                       + word.rjust(WORD_IND)
                       + repr(count).rjust(COUNT_IND)
                       + ' ' * 5
                       + repr(perc).ljust(PERC_IND)
                       + '\n')
            output.write(newline)
        rank += 1
    output.close()
    return True
def main():
    print 'cleaning data.'
    data = pd.read_csv('../output/twitterDB_all.csv', header=None)  # read data
    data.columns = ['tweet', 'city']
    data_clean = data.dropna()  # drop NA rows

    print 'sentiment analysis.'
    data_clean.loc[:, 'senti_score'] = np.nan
    regex = '(\shttp[s]:\\\\)'
    data_clean.loc[:, 'tweet_content'] = data_clean.tweet \
        .apply(lambda x: re.split(regex, x)[0])
    regex2 = '\s@.+\:\s'
    data_clean.loc[:, 'tweet_content'] = data_clean.tweet_content \
        .apply(lambda x: re.split(regex2, x)[-1])
    # sentiment analysis
    data_clean.loc[:, 'senti_score'] = data_clean.tweet_content \
        .apply(lambda x: SentiAnalyze(x))
    data_city = data_clean[['city', 'senti_score', 'tweet_content']]
    data_city.reset_index(drop=True, inplace=True)

    # geocode the country name
    print 'convert city to country.'
    data_city.loc[:, 'country'] = np.nan
    city_names = data_clean.city.unique()
    city_country = {}
    print 'call google api'
    for city in city_names:
        city_country[city] = CountryToCity(city)

    print 'city country matching.'

    def f(x):
        if x in city_country.keys():
            return city_country[x]
        else:
            return x

    data_city['country'] = data_city.city.apply(f)
    data_country = data_city[['country', 'senti_score', 'tweet_content']]

    print 'save the dataframe with sentiment scores.'
    data_country.to_csv('../output/{0}.csv'.format(raw_input('File Name:\n')))

    # word count
    print 'word count.'
    count = WordCount(data_country, 'country', 'tweet_content')
    print 'save the word count pickle file'
    filename = raw_input('WordCount Name:\n')
    with open('../output/{0}.pkl'.format(filename), 'w') as fh:
        pickle.dump(count, fh)
def proper_reset(num, list_of_words):
    wc1 = WordCount()
    wc2 = WordCount()
    # increase wc1
    wc1._total_words += num
    for word in list_of_words:
        wc1._word_counts[word] += 1
    # check that they are now different
    if num > 0:
        assert wc1._total_words > wc2._total_words
    if len(list_of_words) > 0:
        assert len(wc1._word_counts.items()) > len(wc2._word_counts.items())
    # reset
    wc1.reset()
    # check that wc1 has indeed reset
    assert wc1._total_words == wc2._total_words
    assert wc1._total_words == 0
    assert len(wc1._word_counts.items()) == len(wc2._word_counts.items())
    assert len(wc1._word_counts.items()) == 0
def handle_data(data_list, shrunk_line_list, global_list):
    print('still going' + str(time.time()))
    sorted_counts = {
        k: v
        for k, v in sorted(data_list.items(),
                           key=lambda item: item[1], reverse=True)
    }
    shrunk_list = {}
    for key in sorted_counts:
        # update the running global tally for this word
        if key in global_list:
            global_list[key].addCount(sorted_counts[key])
            global_list[key].incrementPageCount()
        else:
            global_list[key] = WordCount(key, sorted_counts[key])
        # keep global_list ordered by total count, highest first
        global_list = {
            k: v
            for k, v in sorted(global_list.items(),
                               key=lambda item: item[1].count, reverse=True)
        }
        global_word_count_list = list(global_list)
        # only keep words that are not already in the global top X
        X = 5
        in_top_X = False
        for x in range(min(X, len(global_list))):
            if key == global_word_count_list[x]:
                in_top_X = True
        if not in_top_X:
            shrunk_list[key] = sorted_counts[key]
    shrunk_list = {
        k: v
        for k, v in sorted(shrunk_list.items(),
                           key=lambda item: item[1], reverse=True)
    }
    shrunk_line_list.append(shrunk_list.copy())
    retVal = list()
    retVal.append(shrunk_line_list)
    retVal.append(global_list)
    return retVal
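# handle_data relies on a per-word WordCount record exposing addCount,
# incrementPageCount and a count attribute. The sketch below is inferred
# from that usage; the real class (and the page-count attribute name) may
# differ.
class WordCount:
    def __init__(self, word, count):
        self.word = word
        self.count = count
        self.page_count = 1  # attribute name is an assumption

    def addCount(self, n):
        # Add this page's occurrences to the running total.
        self.count += n

    def incrementPageCount(self):
        # One more page contained this word.
        self.page_count += 1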
def clean_words(list):
    wc = WordCount()
    for i in range(len(list)):
        list[i] = wc._word_cleaner(list[i])
    return list
def perc_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_percent()


def words_stats(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.word_stats()


def alpha_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_alphabetical()


def ranked_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_ranked()
def unique_word_count(list_of_words):
    wc = WordCount()
    for word in list_of_words:
        wc._add_to_count(word)
    return wc.word_count_unique()
import base64
import csv
import json
import os
from datetime import datetime
from functools import reduce

import boto3

from wordcount import WordCount

kms = boto3.client('kms')
dynamodb = boto3.client('dynamodb')
logs = boto3.client('logs')

if 'SECRETS' in os.environ:
    SECRETS = json.loads(kms.decrypt(
        CiphertextBlob=base64.b64decode(os.environ['SECRETS'])
    )['Plaintext'].decode("utf-8"))

wc = WordCount()


def update_counter(word, book, n=1):
    # Atomically initialize the counter to 0 if the item is new, then add n.
    response = dynamodb.update_item(
        TableName='words',
        Key={
            'Book': {'S': book},
            'BookWord': {'S': word}
        },
        UpdateExpression='SET wordCount = if_not_exists(wordCount, :init) + :inc',
        ExpressionAttributeValues={
            ':inc': {'N': str(n)},
            ':init': {'N': '0'},
        },
        ReturnValues="UPDATED_NEW"
    )
    return response
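# The if_not_exists(wordCount, :init) + :inc expression makes the write an
# atomic "create at 0, then increment", so concurrent invocations cannot
# lose updates. A quick usage example (book title and count are
# hypothetical):
result = update_counter('whale', 'moby-dick', n=3)
print(result['Attributes']['wordCount']['N'])  # running total as a string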
class BagOfWords:
    # NOTE: Python keeps only the last definition of __init__, so this first
    # constructor is shadowed by the one below.
    def __init__(self, language_code, dictionary_path, dir_notes,
                 dict_max_size=None):
        # version that clusters all the files together
        self.bag_of_words = {}
        self.language_code = language_code
        language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
        self.wordcount = WordCount(language_codes[language_code])
        self.word_indexes = self.load_dictionary(dictionary_path, dict_max_size)
        self.create(dir_notes)

    def __init__(self, language_code, output_dir, dir_notes,
                 dict_max_size=None):
        # version that clusters the notes from each file separately
        self.bag_of_words = {}
        self.language_code = language_code
        language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
        self.wordcount = WordCount(language_codes[language_code])
        self.dict_max_size = dict_max_size
        self.create2(dir_notes, output_dir)

    @staticmethod
    def load_dictionary(dictionary_path, dict_max_size=None):
        # dict_max_size=None -> no limit
        dict_indexes = {}
        index = 0
        with open(dictionary_path, 'r') as f:
            for line in f:
                dict_indexes[line.split(' ')[0]] = index
                index += 1
                if dict_max_size is not None and index >= dict_max_size:
                    break
        return dict_indexes

    # parse all languages using the English version
    def create(self, dir_notes):
        pressnote_list = []
        for root, subFolders, files in os.walk(dir_notes):
            for file in files:
                root_wanted = root.split(os.sep)[-1].startswith(
                    self.language_code)
                # there is a translation in rss2.csv
                if (root_wanted and file == 'rss_unique.csv') or (
                        not root_wanted and file == 'rss_en.csv'):
                    print os.path.join(root, file)
                    pressnote_list.extend(
                        PressNote.load_list(os.path.join(root, file)))
        for pressnote in pressnote_list:
            note_dictionary = {}
            self.wordcount.parse_text(pressnote.title, note_dictionary)
            self.wordcount.parse_text(pressnote.text, note_dictionary)
            word_vector = [0] * len(self.word_indexes)
            for key in note_dictionary:
                if key in self.word_indexes:
                    idx = self.word_indexes[key]
                    word_vector[idx] = note_dictionary[key]
            self.bag_of_words[pressnote] = word_vector
        print "Created bag of words: " + str(len(self.bag_of_words)) + \
            " x " + str(len(self.bag_of_words[pressnote_list[0]])) + "\n"

    # parse all files
    def create2(self, dir_notes, output_dir):
        pressnote_list = []
        for root, subFolders, files in os.walk(dir_notes):
            for file in files:
                self.bag_of_words = {}
                root_wanted = root.split(os.sep)[-1].startswith(
                    self.language_code)
                # there is a translation in rss2.csv
                pattern1 = re.compile(r'^rss_unique(\d*)\.csv$')
                pattern2 = re.compile(r'^rss_en(\d*)\.csv$')
                if (root_wanted and pattern1.match(file)) or (
                        not root_wanted and pattern2.match(file)):
                    pressnote_list = PressNote.load_list(
                        os.path.join(root, file))
                    dictionary_maker = DictionaryMaker(self.language_code)
                    dictionary_maker.parse_language2(os.path.join(root, file))
                    dictionary_maker.dump(output_dir + os.sep +
                                          'temp_dictionary.txt')
                    self.word_indexes = self.load_dictionary(
                        output_dir + os.sep + 'temp_dictionary.txt',
                        self.dict_max_size)
                    for pressnote in pressnote_list:
                        note_dictionary = {}
                        self.wordcount.parse_text(pressnote.title,
                                                  note_dictionary)
                        self.wordcount.parse_text(pressnote.text,
                                                  note_dictionary)
                        word_vector = [0] * len(self.word_indexes)
                        for key in note_dictionary:
                            if key in self.word_indexes:
                                idx = self.word_indexes[key]
                                word_vector[idx] = note_dictionary[key]
                        self.bag_of_words[pressnote] = word_vector
                    print "Created bag of words: " + \
                        str(len(self.bag_of_words)) + " x " + \
                        str(len(self.bag_of_words[pressnote_list[0]])) + "\n"
                    match_file = re.match(r'(rss_unique|rss_en)(\d*)\.csv',
                                          file)
                    number = match_file.group(2)
                    if number is None:
                        number = ""
                    self.cluster(output_dir + os.sep +
                                 root.split(os.sep)[-2] + os.sep +
                                 'cluster_' + root.split(os.sep)[-1] +
                                 number + '.txt')

    def cluster(self, clusters_file_path):
        X = []
        Y = {}
        for key in self.bag_of_words:
            X.append(self.bag_of_words[key])
        # pca = PCA(n_components=min(len(X[0]), 5000))
        # pca.fit(X)
        # X = pca.transform(X)
        ## pca = TruncatedSVD(n_components=100)
        ## X = pca.fit_transform(X)
        # print "PCA - done"
        # print "Truncated bag of words of size: " + str(len(X)) + " x " + str(len(X[0])) + "\n"
        ward = AgglomerativeClustering(n_clusters=max(len(X) / 30, 50),
                                       linkage='ward').fit(X)
        print "Clusters created: " + str(max(len(X) / 30, 50))
        idx = 0
        for key in self.bag_of_words:
            Y[key] = ward.labels_[idx]
            idx += 1
        sorted_clusters = sorted(Y.items(), key=operator.itemgetter(1),
                                 reverse=False)
        cluster_number = 0
        with open(clusters_file_path, 'w') as f:
            f.write(str(cluster_number) + "\n")
            for cluster in sorted_clusters:
                if cluster_number != cluster[1]:
                    cluster_number = cluster[1]
                    f.write("\n" + str(cluster_number) + "\n")
                f.write(str(cluster[0]))
        print "Clusters saved: " + clusters_file_path + "\n"
class DictionaryMaker:
    def __init__(self, language_code):
        self.language_codes = {
            'en': 'english',
            'es': 'spanish',
            'fr': 'french'
        }
        self.language_code = language_code
        self.wordcount = WordCount(self.language_codes[language_code])
        self.wordcount_dictionary = {}

    # parse only the specified language
    def parse_language(self, directory, max_parsed_pressnotes=None):
        # max_parsed_pressnotes=None -> no limit
        n = 0
        for root, subFolders, files in os.walk(directory):
            for file in files:
                if root.split(os.sep)[-1].startswith(self.language_code) \
                        and file == 'rss_unique.csv':
                    print os.path.join(root, file)
                    pressnote_list = PressNote.load_list(
                        os.path.join(root, file))
                    for pressnote in pressnote_list:
                        self.wordcount.parse_text(pressnote.title,
                                                  self.wordcount_dictionary)
                        self.wordcount.parse_text(pressnote.text,
                                                  self.wordcount_dictionary)
                        n += 1
                        if max_parsed_pressnotes is not None and \
                                n > max_parsed_pressnotes:
                            break
        print "Parsed: " + str(n) + " press notes"

    # parse only the specified file
    def parse_language2(self, file_path, max_parsed_pressnotes=None):
        # max_parsed_pressnotes=None -> no limit
        n = 0
        print file_path
        pressnote_list = PressNote.load_list(file_path)
        for pressnote in pressnote_list:
            self.wordcount.parse_text(pressnote.title,
                                      self.wordcount_dictionary)
            self.wordcount.parse_text(pressnote.text,
                                      self.wordcount_dictionary)
            n += 1
            if max_parsed_pressnotes is not None and n > max_parsed_pressnotes:
                break
        print "Parsed: " + str(n) + " press notes"

    # parse all languages using the English version
    def parse(self, directory, max_parsed_pressnotes=None):
        # max_parsed_pressnotes=None -> no limit
        n = 0
        for root, subFolders, files in os.walk(directory):
            for file in files:
                root_wanted = root.split(os.sep)[-1].startswith(
                    self.language_code)
                # there is a translation in rss2.csv
                if (root_wanted and file == 'rss_unique.csv') or (
                        not root_wanted and file == 'rss_en.csv'):
                    print os.path.join(root, file)
                    pressnote_list = PressNote.load_list(
                        os.path.join(root, file))
                    for pressnote in pressnote_list:
                        self.wordcount.parse_text(pressnote.title,
                                                  self.wordcount_dictionary)
                        self.wordcount.parse_text(pressnote.text,
                                                  self.wordcount_dictionary)
                        n += 1
                        if max_parsed_pressnotes is not None and \
                                n > max_parsed_pressnotes:
                            break
        print "Parsed: " + str(n) + " press notes"

    def dump(self, dictionary_name, dict_max_size=None):
        # dict_max_size=None -> no limit
        sorted_wordcount = sorted(self.wordcount_dictionary.items(),
                                  key=operator.itemgetter(1), reverse=True)
        if dict_max_size is not None:
            sorted_wordcount = sorted_wordcount[:dict_max_size]
        with open(dictionary_name, 'w') as f:
            keys = [item[0] + " " + str(item[1]) for item in sorted_wordcount]
            f.write('\n'.join(keys))