Example #1
def findTags(file_path, pressnotes, cluster_number, language_code):
    language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
    wordcount = WordCount(language_codes[language_code])
    wordcount_dictionary = {}
    extras = {}

    for pressnote in pressnotes:
        wordcount.parse_text_extra(pressnote.title, wordcount_dictionary,
                                   extras)
        wordcount.parse_text_extra(pressnote.text, wordcount_dictionary,
                                   extras)

    sorted_wordcount = sorted(wordcount_dictionary.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    tags = []
    for item in sorted_wordcount:  # item[0] is the stemmed word
        sorted_extras = sorted(extras[item[0]].items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        if sorted_extras:
            # keep the most frequent original (unstemmed) form as the tag
            tags.append(sorted_extras[0][0])
        if len(tags) >= 10:
            break
    saveNotesToFile(file_path, pressnotes, cluster_number, tags)
Example #2
def file_to_words(file):
    """
    This tests that the file properly opens and returns a list of the words
    without spaces or new lines.
    """
    wc = WordCount()
    return wc._file_reader(file)
Example #3
def writers_words(list_of_files, outputfile):
    counter = WordCount()
    for file in list_of_files:
        counter.count_file(file)

    stats = counter.word_stats()

    with open(outputfile, 'w') as output:
        output.write("Total words counted: "
                     + repr(counter.word_count_total()) + '\n')
        output.write('Rank'.rjust(RANK_IND)
                     + 'Word'.rjust(WORD_IND)
                     + 'Count'.rjust(COUNT_IND)
                     + ' ' * 5
                     + 'Percentage'.ljust(PERC_IND)
                     + '\n')

        rank = 1
        for (count, perc, list_of_words) in stats:
            for word in list_of_words:
                newline = (repr(rank).rjust(RANK_IND)
                           + word.rjust(WORD_IND)
                           + repr(count).rjust(COUNT_IND)
                           + ' ' * 5
                           + repr(perc).ljust(PERC_IND)
                           + '\n')
                output.write(newline)
            rank += 1

    return True
Example #4
def adding_words(list_of_words):
    wc = WordCount()
    manual_count = 0
    for word in list_of_words:
        wc._add_to_count(word)
        manual_count += 1
        assert wc._word_counts[word] > 0
        assert wc._total_words == manual_count
Example #5
def total_word_count(list_of_words):
    wc = WordCount()
    manual_count = 0
    for word in list_of_words:
        wc._add_to_count(word)
        manual_count += 1
        assert wc.word_count_total() == manual_count
        assert wc.word_count_total() == wc._total_words
Example #6
 def __init__(self, language_code):
     self.language_codes = {
         'en': 'english',
         'es': 'spanish',
         'fr': 'french'
     }
     self.language_code = language_code
     self.wordcount = WordCount(self.language_codes[language_code])
     self.wordcount_dictionary = {}
Example #7
def count_file(filename):
    """ This test only works for my numeric-based test files. """
    wc = WordCount()
    wc.count_file(filename)
    assert wc._total_words == 10
    assert wc._word_counts['zero'] == 0
    assert wc._word_counts['one'] == 1
    assert wc._word_counts['two'] == 2
    assert wc._word_counts['three'] == 3
    assert wc._word_counts['four'] == 4
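A hedged sketch of what such a numeric-based test fixture might contain, inferred only from the assertions above (the actual file is not shown in the source): ten words in total, with 'one' appearing once, 'two' twice, 'three' three times and 'four' four times, while 'zero' never appears.

# Hypothetical fixture consistent with the assertions above; the
# filename and contents are illustrative, not taken from the source.
with open('numbers_test.txt', 'w') as f:
    f.write('one two two three three three four four four four\n')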
Example #8
 def __init__(self,
              language_code,
              dictionary_path,
              dir_notes,
              dict_max_size=None):  # version that clusters all the files
     self.bag_of_words = {}
     self.language_code = language_code
     language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
     self.wordcount = WordCount(language_codes[language_code])
     self.word_indexes = self.load_dictionary(dictionary_path,
                                              dict_max_size)
     self.create(dir_notes)
Example #9
 def __init__(self,
              language_code,
              output_dir,
              dir_notes,
              dict_max_size=None
              ):  # version that clusters the notes from each file separately
     self.bag_of_words = {}
     self.language_code = language_code
     language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
     self.wordcount = WordCount(language_codes[language_code])
     self.dict_max_size = dict_max_size
     self.create2(dir_notes, output_dir)
Example #10
def count_mult_files(list_of_filenames):
    wc = WordCount()
    mult = 1
    for file in list_of_filenames:
        wc.count_file(file)
        assert wc._total_words == 10*mult
        assert wc._word_counts['zero'] == 0
        assert wc._word_counts['one'] == 1*mult
        assert wc._word_counts['two'] == 2*mult
        assert wc._word_counts['three'] == 3*mult
        assert wc._word_counts['four'] == 4*mult
        mult += 1
Example #11
def main():
    print('cleaning data.')
    data = pd.read_csv('../output/twitterDB_all.csv', header=None)  # read data
    data.columns = ['tweet', 'city']
    data_clean = data.dropna()  # drop rows with missing values
    print('sentiment analysis.')
    data_clean.loc[:, 'senti_score'] = np.nan
    # strip the trailing URL from each tweet
    regex = '(\shttp[s]:\\\\)'
    data_clean.loc[:, 'tweet_content'] = data_clean.tweet.apply(
        lambda x: re.split(regex, x)[0])
    # drop the leading "@user: " retweet prefix
    regex2 = '\s@.+\:\s'
    data_clean.loc[:, 'tweet_content'] = data_clean.tweet_content.apply(
        lambda x: re.split(regex2, x)[-1])
    # sentiment analysis
    data_clean.loc[:, 'senti_score'] = data_clean.tweet_content.apply(SentiAnalyze)
    data_city = data_clean[['city', 'senti_score', 'tweet_content']]
    data_city.reset_index(drop=True, inplace=True)
    # geocode the country name
    print('convert city to country.')
    data_city.loc[:, 'country'] = np.nan
    city_names = data_clean.city.unique()
    city_country = {}
    print('call google api')
    for city in city_names:
        city_country[city] = CountryToCity(city)
    print('city country matching.')

    def f(x):
        return city_country.get(x, x)

    data_city['country'] = data_city.city.apply(f)
    data_country = data_city[['country', 'senti_score', 'tweet_content']]
    print('save the dataframe with sentiment scores.')
    data_country.to_csv('../output/{0}.csv'.format(input('File Name:\n')))
    # word count
    print('word count.')
    count = WordCount(data_country, 'country', 'tweet_content')
    print('save the word count pickle file')
    filename = input('WordCount Name:\n')
    with open('../output/{0}.pkl'.format(filename), 'wb') as fh:
        pickle.dump(count, fh)
Example #12
def proper_reset(num, list_of_words):
    wc1 = WordCount()
    wc2 = WordCount()

    # increase wc1
    wc1._total_words += num
    for word in list_of_words:
        wc1._word_counts[word] += 1

    # check that they are now different
    if num > 0:
        assert wc1._total_words > wc2._total_words
    if len(list_of_words) > 0:
        assert len(wc1._word_counts.items()) > len(wc2._word_counts.items())

    #reset
    wc1.reset()

    # check that wc1 has indeed reset
    assert wc1._total_words == wc2._total_words
    assert wc1._total_words == 0
    assert len(wc1._word_counts.items()) == len(wc2._word_counts.items())
    assert len(wc1._word_counts.items()) == 0
Example #13
 def testWordCount(self):
     wc = WordCount()
     sc = wc.getSparkContext("WordCountTest", "local[*]")
     input_data = ["Apache Spark is a fast and general engine for large-scale data processing.",
                   "Spark runs on both Windows and UNIX-like systems"]
     inputRDD = sc.parallelize(input_data)
     resultRDD = wc.process(inputRDD)
     resultMap = resultRDD.collectAsMap()

     self.assertEqual(resultMap['Spark'], 2)
     self.assertEqual(resultMap['UNIX-like'], 1)
     self.assertEqual(resultMap['runs'], 1)

     print(resultMap)

     sc.stop()
Example #14
    def _get_top_words(self, content, n):
        """Return top n links from content."""
        left = [
            m.start()
            for m in re.finditer(re.escape(self.LEFT_BRACKET), content)
        ]
        right = [
            m.start()
            for m in re.finditer(re.escape(self.RIGHT_BRACKET), content)
        ]

        wc = WordCount()
        for i in range(0, len(left)):
            wc.add(content[left[i] + len(self.LEFT_BRACKET):right[i]])
        return [key[0] for key in wc.top(n)]
Example #15
def handle_data(data_list, shrunk_line_list, global_list):
    print('still going' + str(time.time()))
    sorted_counts = {
        k: v
        for k, v in sorted(
            data_list.items(), key=lambda item: item[1], reverse=True)
    }
    shrunk_list = {}
    for key in sorted_counts:
        if key in global_list:
            global_list[key].addCount(sorted_counts[key])
            global_list[key].incrementPageCount()
        else:
            global_list[key] = WordCount(key, sorted_counts[key])
        global_list = {
            k: v
            for k, v in sorted(global_list.items(),
                               key=lambda item: item[1].count,
                               reverse=True)
        }
        # skip words that are currently among the top X overall
        global_word_count_list = list(global_list)
        X = 5
        if key not in global_word_count_list[:X]:
            shrunk_list[key] = sorted_counts[key]
    shrunk_list = {
        k: v
        for k, v in sorted(
            shrunk_list.items(), key=lambda item: item[1], reverse=True)
    }
    shrunk_line_list.append(shrunk_list.copy())
    return [shrunk_line_list, global_list]
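A minimal usage sketch of handle_data as defined above, assuming the WordCount(key, count) variant it constructs; the per-page counts below are hypothetical values, not from the source.

# data_list maps each word on one page to its count; shrunk_line_list
# and global_list start empty and are threaded through successive calls.
page_counts = {'the': 12, 'spark': 3, 'data': 7}
shrunk_lines, totals = handle_data(page_counts, [], {})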
Example #16
import base64
import json
import os
from datetime import datetime
import csv
from functools import reduce

import boto3

from wordcount import WordCount

kms = boto3.client('kms')
dynamodb = boto3.client('dynamodb')
logs = boto3.client('logs')

if 'SECRETS' in os.environ:
    SECRETS = json.loads(kms.decrypt(
        CiphertextBlob=base64.b64decode(os.environ['SECRETS'])
    )['Plaintext'].decode("utf-8"))

wc = WordCount()


def update_counter(word, book, n=1):
    response = dynamodb.update_item(
        TableName='words',
        Key={
            'Book': {'S': book},
            'BookWord': {'S': word}
        },
        UpdateExpression='SET wordCount = if_not_exists(wordCount, :init) + :inc',
        ExpressionAttributeValues={
            ':inc': {'N': str(n)},
            ':init': {'N': '0'},
        },
        ReturnValues="UPDATED_NEW"
    )
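For illustration, a hedged usage sketch of the update_counter helper defined above; the word, book title and increment are hypothetical values, not from the source.

# Atomically bumps the 'wordCount' attribute for the word 'whale' in the
# book 'moby-dick' by 3, creating the item if it does not exist yet.
update_counter('whale', 'moby-dick', n=3)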
Example #17
def clean_words(word_list):
    wc = WordCount()
    for i in range(len(word_list)):
        word_list[i] = wc._word_cleaner(word_list[i])
    return word_list
Example #18
def words_stats(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.word_stats()
Example #19
def perc_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_percent()
Example #20
def ranked_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_ranked()
Example #21
def alpha_words(filename):
    wc = WordCount()
    wc.count_file(filename)
    return wc.words_alphabetical()
Example #22
def unique_word_count(list_of_words):
    wc = WordCount()
    for word in list_of_words:
        wc._add_to_count(word)
    return wc.word_count_unique()
Example #23
 def test_wordcount(self):
     self.job = WordCount().run(input=[chekhov])
     self.assertEqual(len(list(self.results(self.job))), 12339)