Example n. 1
0
 def frequent_word_statistics(self):
     """Collect word statistics from the index/login/register URL lists.

     For every URL in categories/<page_type>/url.txt, fetches words of
     length 2..6 via self.fetch_words, accumulates them with self.collect,
     and counts how often each word occurs.  The words are then written
     (most frequent first) to categories/words.txt and the matching
     frequencies to categories/all/vector.txt.

     Exits the process with status 1 if any expected file is missing.
     """
     try:
         word_collection = []
         word_frequency_collection = []
         # Index WordFrequency entries by word for O(1) lookup, replacing
         # the original O(n) list scan per word.  (The original relied on
         # WordFrequency equality comparing by word — presumably via
         # __eq__; this dict makes that intent explicit.)
         frequency_by_word = {}
         for page_type in ("index", "login", "register"):
             # Context manager closes the handle (the original leaked it).
             with open("categories/" + page_type + "/url.txt") as url_file:
                 urls = url_file.read().split("\n")
             for url in urls:
                 for word in self.fetch_words(url, 2, 6):
                     word_collection = self.collect(word_collection, word)
                     entry = frequency_by_word.get(word)
                     if entry is not None:
                         entry.frequency += 1
                     else:
                         entry = WordFrequency(word)
                         frequency_by_word[word] = entry
                         word_frequency_collection.append(entry)
         # Ascending sort followed by reverse() — not reverse=True — to
         # keep the original tie-breaking order among equal frequencies.
         word_frequency_collection.sort(key=lambda wf: wf.frequency)
         word_frequency_collection.reverse()
         with open("categories/words.txt", 'w',
                   encoding='utf-8') as word_file, \
              open("categories/all/vector.txt", 'w',
                   encoding='utf-8') as vector_file:
             for word_frequency in word_frequency_collection:
                 print(word_frequency.word + " " +
                       str(word_frequency.frequency))
                 word_file.write(word_frequency.word + "\n")
                 vector_file.write(str(word_frequency.frequency) + "\n")
     except FileNotFoundError as e:
         print(e)
         sys.exit(1)
Example n. 2
0
 def calc_frequencies(self):
     """Tally document-wide frequencies for the filtered words.

     Each word in self.filtered_words either increments its existing
     WordFrequency entry or gets a fresh entry appended to
     self.word_frequencies.
     """
     for token in self.filtered_words:
         entry = self.get_word_frequency(token)
         if entry is not None:
             entry.add()
             continue
         self.word_frequencies.append(
             WordFrequency(token, self.words_total))
 def calc_texts_frequencies(self):
     """Tally per-text word frequencies into the frequencies matrix.

     For each text index up to self.texts_total, counts every word of
     self.words_matrix[index] into the matching row of
     self.word_frequencies_matrix.
     """
     for idx in range(self.texts_total):
         for token in self.words_matrix[idx]:
             entry = self.get_word_frequency(token, idx)
             if entry is not None:
                 entry.add()
                 continue
             # NOTE(review): len(token) as the second constructor arg looks
             # odd — elsewhere WordFrequency receives a total word count.
             # Reproduced as-is; confirm against WordFrequency.__init__.
             self.word_frequencies_matrix[idx].append(
                 WordFrequency(token, len(token)))
from WordFrequency import WordFrequency
import sys
import json
import operator
from collections import OrderedDict
from collections import Counter


if __name__ == '__main__':
    # Build the MRJob subclass from the command-line arguments.
    job = WordFrequency(args=sys.argv[1:])
    with job.make_runner() as runner:
        # Run the job.
        runner.run()

        # Persist "key value" pairs, one per line, to result.txt.
        # The original used the Python-2 print statement and never closed
        # the file; str.format and the context manager work on both 2 and 3.
        with open("result.txt", "w") as f:
            for line in runner.stream_output():
                key, value = job.parse_output_line(line)
                print('key: {0} value: {1}'.format(key, value))
                f.write("{0} {1}\n".format(key, value))
Example n. 5
0
def edits1(word, letters=None):
    """Return the set of all strings one edit away from *word*.

    An edit is a single deletion, adjacent transposition, replacement,
    or insertion.  *letters* is the character set used for replacements
    and insertions; it defaults to the module-level ``alphabet`` for
    backward compatibility, and may be passed explicitly to use a
    different (e.g. non-English) alphabet.

    >>> sorted(edits1("a", "ab"))
    ['', 'a', 'aa', 'ab', 'b', 'ba']
    """
    if letters is None:
        letters = alphabet
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in letters if b]
    inserts = [a + c + b for a, b in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def datafile(name, sep='\t'):
    """Yield key,value field lists from *name*, one list per line.

    Each line is split on *sep*.  The trailing newline is kept on the
    last field, matching the original behaviour.  The Python-2 ``file()``
    builtin is replaced with ``open()`` in a context manager so the
    handle is closed deterministically.
    """
    with open(name) as fh:
        for line in fh:
            yield line.split(sep)


# Frequency table of single-edit corrections, loaded from Norvig's
# count_1edit dataset.
edit_frequencies = WordFrequency.from_freq_file('Norvig/edits/count_1edit.txt')


# Prior probability that a typed word contains a spelling error.
p_spell_error = 1./20.

def p_edit(edit):
    """Return the probability of an edit; *edit* can be '' or 'a|b'.

    The empty string means "no edit" and carries the complement of the
    spelling-error prior; any other edit is weighted by its observed
    frequency.
    """
    return (1. - p_spell_error if edit == ''
            else p_spell_error * edit_frequencies.get_probability(edit))

# Every prefix (including '' and the full word) of every word in Pw —
# presumably the vocabulary of a word-probability model defined elsewhere.
PREFIXES = set(w[:i] for w in Pw for i in range(len(w) + 1))


def edits(word, dictionary, d=2):
Example n. 6
0
from Context import Context
from EditDistance import edits
from WordFrequency import WordFrequency
import re
import Readers

# Unigram and bigram frequency models loaded from Norvig's count datasets.
count_1w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_1w.txt")
count_2w = WordFrequency.from_freq_file("data/Norvig/wordfreqs/count_2w.txt")



def corrections(text):
    "Spell-correct all words in text."
    # NOTE(review): correct() below takes a Context and calls
    # context.word(), yet here it receives the raw matched string —
    # confirm which correct() this is meant to call.
    return re.sub('[a-zA-Z]+', lambda m: correct(m.group(0)), text)

def correct(context):
    "Return the word that is the most likely spell correction of w."
    # Candidate corrections reachable by edits from the context's word.
    candidates = edits(context.word()).items()

    # NOTE(review): the candidate selection below is commented out, so
    # this function currently returns None — confirm intended behaviour
    # (Pedit/Pw are not defined in this view).
    #c, edit = max(candidates, key=lambda (c,e): Pedit(e) * Pw(c))
    #return c


# Drive the corrector over every word context read from doc.txt.
# (correct() currently discards its result — see the note on correct().)
for context in Context.gen_context_sequence_from_word_sequence(Readers.gen_words_from_file("doc.txt")):
    correct(context)