def compute_p_word_given_class(data_paths, vocab_size):
    """
    Return a dictionary of word probabilities, P(word | class).

    All paths in ``data_paths`` belong to the same class. Laplacian smoothing
    with k=1 is applied, and the result includes an 'UNKNOWN_WORD' entry that
    stands for any word that does not appear in the training set.

    :param data_paths: iterable of paths, each readable by ``open_file``.
    :param vocab_size: vocabulary size added to the smoothing denominator.
    :return: dict mapping every seen word, plus 'UNKNOWN_WORD', to its
        smoothed probability.
    """
    # One pass over the files: count occurrences and the class size together
    # instead of opening every file twice.
    word_counts = {}
    class_size = 0
    for path in data_paths:
        words = get_words(open_file(path))
        class_size += len(words)
        for word in words:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Same denominator as before: class size + vocabulary size + 1.
    denominator = class_size + vocab_size + 1

    # (count + 1) / denominator is what the incremental form converged to
    # (first sighting 2/denominator, each repeat +1/denominator), computed in
    # one division to avoid accumulating floating-point error.
    p_word_given_class = {
        word: (count + 1) / denominator
        for word, count in word_counts.items()
    }
    p_word_given_class['UNKNOWN_WORD'] = 1 / denominator
    return p_word_given_class
Exemple #2
    def __get_equals_words_atr(self, summary_sentence, document_sentence):
        ''' Return the proportion of summary words shared with the document.

        The numerator counts each distinct summary word that also occurs in
        the document sentence once; the denominator is the total number of
        summary words (repetitions included). Returns 0.0 when the summary
        sentence yields no words, instead of dividing by zero.
        '''
        words_summary = utils.get_words(summary_sentence, stop_words=True)
        words_document = utils.get_words(document_sentence, stop_words=True)
        total_words = len(words_summary)
        if total_words == 0:
            return 0.0

        # Set intersection gives the same distinct-match count as the nested
        # comparison loop, without the quadratic scan.
        equals_words = len(set(words_summary) & set(words_document))
        return equals_words / total_words
Exemple #3
    def analyze_text(self, text):
        """
        Compute basic readability counts for ``text``.

        Stores the counts together with the word list in
        ``self.analyzedVars`` and returns the counts alone (no 'words' key).
        All counts are converted to float.

        Raises ZeroDivisionError when ``get_sentences`` finds no sentence.
        """
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        outData = {
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence),
        }

        # Same keys as outData, preceded by the raw word list.
        self.analyzedVars = {'words': words}
        self.analyzedVars.update(outData)
        return outData
Exemple #4
def compute(test_id='test'):
    """
    Recognise the words in ``static/<test_id>.png`` and score the result
    against the whitespace-separated words in ``static/<test_id>.txt``.

    Returns a 5-tuple: the base image and the boxed image (each passed through
    ``img2str``), the recognised text, the ground-truth text, and the
    ``difflib.SequenceMatcher`` ratio between the two texts.
    """
    pic_path = 'static/' + test_id + '.png'
    truth_path = 'static/' + test_id + '.txt'
    base_image, pic_with_box, word_pics = get_words(pic_path)
    word_pics = np.stack(word_pics)
    pred = F.softmax(net_recog(torch.from_numpy(word_pics)), dim=1)
    # Highest-scoring class index for each word image.
    _, idxes = torch.max(pred, dim=1)

    res = []
    # NOTE(review): the body of this loop is missing from the file, so the
    # function does not parse. Presumably it mapped each idx to a word and
    # appended it to res -- restore from the original source.
    for idx in idxes:

    # NOTE(review): this handle is never closed; consider a `with` block.
    f = open(truth_path, 'r')
    lines = f.readlines()
    lines = [l.strip() for l in lines]
    truth = []
    for l in lines:
        truth += l.split()

    # Wrap every recognised word that is absent from the ground truth in a
    # red <span>.
    for i, w in enumerate(res):
        if w not in truth:
            res[i] = '<span style="color:red">' + res[i] + '</span>'
    truth = ' '.join(truth)
    # The recognised words are joined in reverse order.
    res = ' '.join(res[::-1])
    # The ratio is computed on the marked-up string, <span> tags included.
    score = difflib.SequenceMatcher(a=truth, b=res).ratio()

    return img2str(base_image), img2str(pic_with_box), res, truth, score
Exemple #5
 def p_word_given_class(self, data_files, vocab_size):
     """
     helper function
     return dictionary representation of P(word | class)

     Counts unigrams and/or bigrams (depending on the module-level
     ``unigrams`` / ``bigrams`` flags) over ``data_files`` and applies
     add-k smoothing with ``self.k``.
     """
     word_counter = Counter()
     for file in data_files:
         words = get_words(open_file(file))
         for i, word in enumerate(words):
             if unigrams:
                 # NOTE(review): the body of this `if` is missing from the
                 # file (likely a skip or an "UNK" substitution for
                 # out-of-vocabulary words) -- restore from the original.
                 if word not in self.vocab:
                 word_counter[word] += 1
             if bigrams:
                 if i != 0:
                     bigram = words[i - 1] + ' ' + word
                     # NOTE(review): body missing here as well.
                     if bigram not in self.vocab:
                     word_counter[bigram] += 1
     # "UNK" is forced to a raw count of 0 before smoothing.
     word_counter["UNK"] = 0
     # The total is taken once, before the counts are overwritten in place.
     total_count = sum(word_counter.values())
     for word in word_counter:
         word_counter[word] = (word_counter[word] +
                               self.k) / (total_count + vocab_size * self.k)
     p_word_given_class = dict(word_counter)
     return p_word_given_class
Exemple #6
def construct_libsvm_line(line):
    """
    Build one space-separated "label index:1 index:1 ..." string from a row.

    Columns listed in ``indexes2binarize`` contribute the index of their
    value; the remaining processed columns are tokenised with ``get_words``
    and contribute one index per word.

    NOTE(review): several lines are missing from this function (see inline
    notes), so it does not parse as shown.
    """

    global target_index, value_indexes, headers, indexes2binarize, indexes2tokenize, indexes2filter

    label = target_index

    new_line = []
    for i in sorted(indexes2binarize + indexes2tokenize):

        col_name = headers[i]

        if i in indexes2binarize:
            value = line[i]
            # NOTE(review): the `try:` that opened this lookup is missing, and
            # the `except KeyError:` handler below has no body.
                value_index = value_indexes[i][value]
            except KeyError:

            new_item = "%s:1" % (value_index)
            # NOTE(review): an append of new_item and the branch header for
            # tokenised columns (probably `elif i in indexes2tokenize:`)
            # appear to be missing here.
            text = line[i]
            words = get_words(text)

            # word_indexes = map( lambda x: value_indexes[i][x], words )
            word_indexes = get_word_indexes(words, i)
            for word_index in sorted(word_indexes):
                new_item = "%s:1" % (word_index)
                # NOTE(review): new_item is never appended to new_line in
                # this copy.

    new_line.insert(0, label)
    new_line = " ".join(new_line)
    return new_line
Exemple #7
 def p_class_given_input(self, input_path, p_word_given_class, p_class):
     """
     helper function
     return P(class | input)

     The value returned is a sum of ``math.log`` terms (a log-score), built
     from the distinct bigrams and/or unigrams of the input depending on the
     module-level ``bigrams`` / ``unigrams`` flags, plus ``log(p_class)``.
     """
     p_class_given_input = 0
     words = get_words(open_file(input_path))
     if bigrams:
         # Each distinct bigram is scored once, however often it occurs.
         sample_bigrams = set()
         for i, word in enumerate(words):
             if i != 0:
                 sample_bigrams.add(words[i - 1] + ' ' + word)
         for bigram in sample_bigrams:
             # NOTE(review): the body of this `if` is missing from the file
             # -- restore from the original source.
             if bigram not in self.vocab:
             if bigram not in p_word_given_class:
                 bigram = "UNK"
             p = p_word_given_class[bigram]
             # Non-positive probabilities are skipped so log() is defined.
             if p > 0:
                 p_class_given_input += math.log(p_word_given_class[bigram])
     if unigrams:
         for word in set(words):
             # NOTE(review): body missing here as well.
             if word not in self.vocab:
             if word not in p_word_given_class:
                 word = "UNK"
             p = p_word_given_class[word]
             if p > 0:
                 p_class_given_input += math.log(p_word_given_class[word])
     p_class_given_input += math.log(p_class)
     return p_class_given_input
def construct_libsvm_line( line ):
	"""
	Build one space-separated "label index:1 index:1 ..." string from a row.

	Columns in ``indexes2binarize`` contribute the index of their value;
	the text of the remaining processed columns is tokenised with
	``get_words`` and contributes one index per word.

	NOTE(review): several lines are missing (see inline notes), so this
	function does not parse as shown.
	"""

	global target_index, value_indexes, headers, indexes2binarize, indexes2tokenize, indexes2filter

	label = target_index
	new_line = []
	for i in sorted( indexes2binarize + indexes2tokenize ):
		col_name = headers[i]
		if i in indexes2binarize:
			value = line[i]	
			# NOTE(review): the `try:` line is missing and the
			# `except KeyError:` handler below has no body.
				value_index = value_indexes[i][value]
			except KeyError:
			new_item = "%s:1" % ( value_index )
			new_line.append( new_item )
			# NOTE(review): the branch header for tokenised columns
			# (probably `elif i in indexes2tokenize:`) is missing here.
			text = line[i]
			words = get_words( text )
			# word_indexes = map( lambda x: value_indexes[i][x], words )
			word_indexes = get_word_indexes( words, i )
			for word_index in sorted( word_indexes ):
				new_item = "%s:1" % ( word_index )
				new_line.append( new_item )		
	new_line.insert( 0, label )
	new_line = " ".join( new_line )
	return new_line
Exemple #9
def compute_p_word_given_class(data_paths, vocab_size):
    """
    Return a dictionary of word probabilities, P(word | class).

    All datapaths belong to the same class. Laplacian smoothing with k=1 is
    incorporated, and the result includes the probability of 'UNKNOWN_WORD',
    i.e. any word that doesn't appear in the training set.

    :param data_paths: iterable of paths, each readable by ``open_file``.
    :param vocab_size: vocabulary size added to the smoothing denominator.
    :return: dict of smoothed probabilities keyed by word.
    """
    count_words = {}
    total_num_words = 0

    for path in data_paths:
        words = get_words(open_file(path))
        total_num_words += len(words)
        for word in words:
            # A missing `else:` used to reset every repeated word to 1;
            # .get() keeps the increment and the first sighting distinct.
            count_words[word] = count_words.get(word, 0) + 1

    denominator = total_num_words + vocab_size + 1

    p_word_given_class = {
        word: (count + 1) / denominator
        for word, count in count_words.items()
    }
    p_word_given_class["UNKNOWN_WORD"] = 1 / denominator

    return p_word_given_class
Exemple #10
def get_cut_points(
    time_dict: Dict[datetime.datetime, List[str]],
    up_ratio: float = 2,
    down_ratio: float = 0.75,
    topK: int = 5
) -> List[Tuple[datetime.datetime, datetime.datetime, List[str]]]:
    """
    Find (start, end, tags) segments where the per-bucket text count surges
    and then drops again.

    Buckets are visited in ``time_dict`` iteration order. A surge starts when
    a bucket holds at least ``up_ratio`` times as many texts as the previous
    bucket; it ends at the first later bucket holding fewer than
    ``down_ratio`` times the previous bucket's count. Outside a surge the
    candidate start time follows the current bucket.

    :param time_dict: mapping of bucket time to the texts in that bucket.
    :param up_ratio: growth factor that opens a surge.
    :param down_ratio: shrink factor that closes a surge.
    :param topK: forwarded to ``utils.get_words`` when extracting tags.
    :return: list of ``(start_time, end_time, tags)`` tuples.
    """
    in_surge = False
    cut_points = []
    prev_num = None
    start_time = None
    for time, texts in time_dict.items():
        num = len(texts)
        if prev_num is None:
            start_time = time
        elif not in_surge:
            if num >= prev_num * up_ratio:
                in_surge = True
            else:
                start_time = time
        elif num < prev_num * down_ratio:
            # NOTE(review): tags come from the closing bucket's texts only.
            # The removed `temp_texts` copies were never read; if tags were
            # meant to cover the whole segment, that accumulation is missing.
            tags = utils.get_words("。".join(texts), topK=topK)
            cut_points.append((start_time, time, tags))
            in_surge = False
            start_time = time
        prev_num = num
    return cut_points
    def analyze_text(self, text):
        """
        Compute basic readability counts for ``text`` and store them, with
        the file name and detected encoding, in ``self.analyzedVars``.

        ``text`` must support ``.decode('utf-8')`` (it is decoded before the
        complex-word count). Raises ZeroDivisionError when no sentence is
        found.
        """
        words = get_words(text)
        char_count = int(get_char_count(words))
        word_count = int(len(words))
        sentences = get_sentences(text)
        len_sentences = len(sentences)
        sentence_count = int(len_sentences)
        # sentence_count = int(len(get_sentences(text)))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text.decode('utf-8'))
        avg_words_p_sentence = word_count / sentence_count
        encoding_dict = detect_encoding(self.filename)

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence),
            'encoding': encoding_dict['encoding'],
            'encoding_confidence': encoding_dict['confidence']
        }
Exemple #12
def SPIMI(docs, details=False):
    """
    Build a merged, gap-encoded inverted index over ``docs``.

    Each document is read, tokenised with ``get_words`` (using the stop list
    from ``gen_default_stop``) and indexed with ``invert_index``. Positions
    are shifted by the number of postings seen in earlier documents, so the
    merged posting lists share one global position space. Every posting list
    is then converted to gaps between consecutive positions.

    :param docs: iterable of file paths.
    :param details: when true, also return a statistics dict.
    :return: the index, or ``(index, stats)`` when ``details`` is true.
    """
    ans = {}
    cur = 0
    tmp = []
    for doc in docs:
        cnt = 0
        with open(doc, 'r') as f:
            s = f.read()
        stop_list = gen_default_stop(s)
        words = get_words(s, stop_list)
        indexing = invert_index(words, delta=False)
        for key, value in indexing.items():
            shifted = list(np.array(value) + cur)
            if key not in ans:
                ans[key] = shifted
            else:
                ans[key] += shifted

            cnt += len(value)
        cur += cnt
        # NOTE(review): `tmp` is reported as the per-document lengths but was
        # never filled; recording cnt here is the assumed intent -- confirm.
        tmp.append(cnt)

    for value in ans.values():
        # Walk backwards so each gap is taken against the ORIGINAL previous
        # position; a forward in-place pass would subtract an already
        # converted gap.
        for i in range(len(value) - 1, 0, -1):
            value[i] -= value[i - 1]

    if details:
        res = {'词项数目': len(ans), '文档数量': len(docs), \
                '词条数量': cur, '文档长度': tmp, '文档平均长度': np.average(tmp)}
        return ans, res
    return ans
 def __get_words_summary(self, summary_text):
     ''' Gets the words of the summary and counts each unique word.

     Stores the word list in ``self.__words_summary`` and adds the
     occurrence counts to ``self.__unique_words_summary``.
     '''
     self.__words_summary = utils.get_words(summary_text)
     for word in self.__words_summary:
         if word in self.__unique_words_summary:
             self.__unique_words_summary[word] += 1
         else:
             # First sighting; without this branch every count was reset
             # to 1 (or raised KeyError on an unseen word).
             self.__unique_words_summary[word] = 1
Exemple #14
 def __init__(self, fold):
     """
     Set up the train/test split for ``fold`` and build the vocabulary.

     The vocabulary is a Counter of bigrams and/or unigrams (per the
     module-level ``bigrams`` / ``unigrams`` flags) over the positive and
     negative training files. Entries seen fewer times than the matching
     frequency cutoff are removed.
     """
     self.fold = fold
     self.X_train_pos, self.X_test_pos = get_data_paths(
         data_positive, test_data_start, numfolds, self.fold)
     self.X_train_neg, self.X_test_neg = get_data_paths(
         data_negative, test_data_start, numfolds, self.fold)
     self.vocab = Counter()
     # Entries are collected first and deleted afterwards, because the
     # Counter must not change size while it is being iterated.
     words_to_delete = set()
     ft = 0
     if bigrams:
         for path in self.X_train_pos + self.X_train_neg:
             message = open_file(path)
             words = get_words(message)
             for i, word in enumerate(words):
                 if i != 0:
                     bigram = words[i - 1] + ' ' + word
                     self.vocab[bigram] += 1
         ft = frequency_cutoff_bigram
         for word in self.vocab:
             if self.vocab[word] < ft:
                 words_to_delete.add(word)
     if unigrams:
         for path in self.X_train_pos + self.X_train_neg:
             message = open_file(path)
             words = get_words(message)
             for word in words:
                 self.vocab[word] += 1
         ft = frequency_cutoff_unigram
         # NOTE(review): when both flags are on, this pass also checks the
         # bigram entries against the unigram cutoff.
         for word in self.vocab:
             if self.vocab[word] < ft:
                 words_to_delete.add(word)
     for word in words_to_delete:
         del (self.vocab[word])
     self.vocab_size = len(self.vocab)
     self.p_word_given_pos = dict()
     self.p_word_given_neg = dict()
     self.p_pos = 0
     self.p_neg = 0
     self.k = k
Exemple #15
 def __get_words_summary(self, summary_text):
     ''' Gets the words of the summary and tallies every unique word.

     The word list goes to ``self.__words_summary``; occurrence counts are
     accumulated in ``self.__unique_words_summary``.
     '''
     self.__words_summary = utils.get_words(summary_text)
     for word in self.__words_summary:
         if word in self.__unique_words_summary:
             self.__unique_words_summary[word] += 1
         else:
             # Restored branch: start the tally for a word not seen before.
             self.__unique_words_summary[word] = 1
 def __get_keywords_positions(self, text_document):
     ''' Return a dict with an (empty) list for every keyword found in
     ``text_document``.

     NOTE(review): nothing is ever appended to the lists, so no position
     is recorded; an append step looks to be missing -- confirm against
     the original source.
     '''
     found = dict()
     for token in utils.get_words(text_document):
         if token in self.__keywords_list:
             # setdefault only creates the list on the first sighting.
             found.setdefault(token, list())
     return found
def relable_transformer(val, keywords, no_hit_to_null=True):
    """
    Map ``val`` to the replacement of the first matching keyword.

    ``val`` is normalised by joining ``get_words(val)`` with single spaces;
    a keyword matches when it is a substring of that normalised text.

    :param val: raw value to relabel.
    :param keywords: iterable of ``(keyword, replacement)`` pairs, tried in
        order.
    :param no_hit_to_null: when no keyword matches, return ``na_value`` if
        true, otherwise return ``val`` unchanged.
    """
    val_clean = ' '.join(get_words(val))
    for keyword, repl in keywords:
        if keyword in val_clean:
            return repl
    if no_hit_to_null:
        return na_value
    # Previously unreachable (it sat behind the `return` above), so a miss
    # with no_hit_to_null=False returned None instead of the input.
    return val
 def __frequency_word(self, text_document):
     ''' Return a dict mapping each word of ``text_document`` to the number
     of times it occurs. '''
     frequency_word = dict()
     for word in utils.get_words(text_document):
         # .get() covers the first sighting; the lost `else:` branch used
         # to reset every count to 1.
         frequency_word[word] = frequency_word.get(word, 0) + 1
     return frequency_word
Exemple #19
 def create_position_list(self):
     ''' Obtains the positions in the document of the words summary: 'S1_W1'
     S1 = position of the sentence, W1 = position of word in the sentences
     (both 1-based). Positions are appended to
     ``self.__position_word_list[word]``. '''
     for i, sentence in enumerate(self.__document_sentence_list):
         words = utils.get_words(sentence)
         for j, word in enumerate(words):
             if word in self.__unique_words_summary:
                 if word not in self.__position_word_list:
                     self.__position_word_list[word] = list()
                 # This copy stopped after creating the empty list; the
                 # 'S_W' entry follows the format stated in the docstring.
                 self.__position_word_list[word].append(
                     str(i + 1) + '_' + str(j + 1))
Exemple #20
def prepare_chn_data(args):
    """
    Write every 2-word combination of the words in the configured 'chn' file
    to a sibling "combination_<chn>" file.

    File names come from the settings parsed out of ``args.setting`` and are
    resolved under ``ENV.data_dir``. Any exception raised while generating
    the combinations is printed, not propagated (best-effort).
    """
    settings = parse_settings(args.setting)
    try:
        chn_file = os.path.join(ENV.data_dir, "%s" % settings['chn'])
        combinations = os.path.join(ENV.data_dir, "combination_%s" % settings['chn'])
        words = utils.get_words(chn_file)
        # range(2, 3) yields only 2: pairs of words.
        for i in range(2, 3):
            for expression in itertools.combinations(words, i):
                utils.push_word_back(combinations, ''.join(expression))
    except Exception as e:
        # print(e) behaves the same on Python 2 and Python 3.
        print(e)
    def __get_sense_units_atr(self, summary_sentence, document_sentence):
        ''' Return the number of synonym matches between the two sentences.

        Each distinct summary word is compared against every document word;
        a match is counted when the document word is listed among the
        summary word's entries in ``self.__tep_synonyms``.
        '''
        words_summary = utils.get_words(summary_sentence, stop_words=True)
        words_document = utils.get_words(document_sentence, stop_words=True)
        unique_words = dict()
        cont = 0
        tmp = list()
        for word_summary in words_summary:
            if not word_summary in unique_words:  # to avoid repetitions
                unique_words[word_summary] = 1
                for word_document in words_document:
                    # NOTE(review): the subscript was cut off after
                    # `self.__tep_synonyms[`; indexing by word_summary is
                    # the assumed original -- confirm.
                    if (word_summary in self.__tep_synonyms
                            and word_document
                            in self.__tep_synonyms[word_summary]):
                        tmp.append((word_summary, word_document))
                        cont += 1

        return cont  #(cont, tmp)
 def create_position_list(self):
     ''' Obtains the positions in the document of the words summary: 'S1_W1'
     S1 = position of the sentence, W1 = position of word in the sentences
     (both 1-based). Positions are appended to
     ``self.__position_word_list[word]``. '''
     for i, sentence in enumerate(self.__document_sentence_list):
         words = utils.get_words(sentence)
         for j, word in enumerate(words):
             if word in self.__unique_words_summary:
                 if word not in self.__position_word_list:
                     self.__position_word_list[word] = list()
                 # Restores the opening of the append call, of which only
                 # the argument line survived.
                 self.__position_word_list[word].append(
                     str(i + 1) + '_' + str(j + 1))
 def build_markov(self, file_name):
     """
     Load the words of ``file_name`` into ``self.word_list`` and make sure
     every predecessor word has a ``Dictogram`` entry in ``self``.

     NOTE(review): as written, each Dictogram is created but no transition
     is ever recorded in it, and the first word is used as its own
     predecessor. Lines appear to be missing -- compare with the original.
     """
     previous = None
     self.word_list = utils.get_words(file_name)
     for word in self.word_list:
         if previous is None:
             previous = word
         if previous not in self:
             self[previous] = Dictogram()
         previous = word
Exemple #24
def count_file(filename):
    """
    Count word frequencies in ``filename`` and return them formatted by
    ``frequencies_string``.

    The file is consumed chunk by chunk through ``_read_chunk_full_words``
    until it returns an empty string.
    """
    freqs = Counter()
    with open(filename, 'r') as file:
        chunk_reader = partial(_read_chunk_full_words, file)

        for chunk in iter(chunk_reader, ''):
            words = get_words(chunk)
            # The tally was never updated, so the result was always built
            # from an empty Counter.
            freqs.update(words)

    return frequencies_string(freqs)
Exemple #25
def start_classify(path, classifier):
    """
    Classify every word read from ``path`` and append it to the matching
    output file.

    Words labelled 'words' go to "words.txt" and words labelled 'pinyin' go
    to "pinyin.txt", both inside the folder returned by ``prepare_output``.
    Any other label is ignored.
    """
    output_folder = prepare_output()
    targets = {
        'words': os.path.join(output_folder, "words.txt"),
        'pinyin': os.path.join(output_folder, "pinyin.txt"),
    }
    for word in utils.get_words(path):
        # Classify once per word; the previous elif chain ran the classifier
        # a second time for every word not labelled 'words'.
        target = targets.get(classifier.classify(word))
        if target is not None:
            utils.push_word_back(target, word)
Exemple #26
def get_bag_of_symbols(max_len, string):
    """
    Slide windows of 1..``max_len`` words over the words of ``string`` and
    return the collected ``bag``.

    NOTE(review): the innermost loop has no body and nothing is ever added
    to ``bag``, so this does not parse as shown. Presumably each window's
    words were gathered into ``symbol`` and the symbol appended to ``bag``
    -- restore from the original source.
    """
    words = get_words(string)
    bag = []
    window_size = 1
    while window_size <= max_len:
        # s_i: start index of every window that fits entirely in words.
        for s_i in range(0, len(words) - window_size + 1):
            symbol = []
            for win_i in range(s_i, s_i + window_size):
        window_size = window_size + 1
    return bag
Exemple #27
 def __init__(self, board=None):
     """
     Create a board, either from ``board`` or from 25 randomly sampled words.

     When ``board`` is None, 25 words are sampled from ``get_words()`` and
     paired with the labels from ``Board.get_label_list()``. The starting
     team is BLUE when it owns more words than RED, otherwise RED.
     """
     if board is None:
         words = get_words()
         sample = random.sample(words, 25)
         labels = Board.get_label_list()
         self.board = [
             Word(word, label) for (word, label) in zip(sample, labels)
         ]
     else:
         # Restored branch: a caller-supplied board is used as given.
         self.board = board
     self.starting_team = Tags.BLUE if len(self.get_words(Tags.BLUE)) > len(
         self.get_words(Tags.RED)) else Tags.RED
Exemple #28
def frequency_analysis():
    """
    Compare the letter frequencies of ``search_string`` with the reference
    frequencies from ``utils.get_frequencies()``.

    NOTE(review): the function is cut off after the final `if`, so its
    effect on the global ``result`` cannot be documented from here.
    """
    global result
    words = utils.get_words(search_string)
    frequencies = utils.get_frequencies()
    input_frequencies = utils.get_frequencies(words, "input")

    # presumably the most frequent input letter -- verify the ordering that
    # get_frequencies returns
    input_first_letter = input_frequencies[0]

    for i in frequencies:
        # Code-point distance between the input's first entry and this
        # reference entry.
        delta = ord(input_first_letter[0]) - ord(i[0])
        if result.percentage == 100:
Exemple #29
    def do_preprocess(self, url_list, label_list):
        """
        Build the word/n-gram/char id representations for ``url_list``,
        split them into train and test sets, and dump the dictionaries.

        :param url_list:
        :param label_list:

        NOTE(review): several calls below are truncated mid-argument-list
        and one `if` has no body, so this method does not parse as shown.
        """
        if MIN_WORD_FREQ > 0:
            # NOTE(review): the rest of this call's arguments are missing.
            x__, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS,
            self.high_freq_words = sorted(list(word_reverse_dict.values()))

        self.x, self.word_reverse_dict = get_word_vocab(
            url_list, MAX_LENGTH_WORDS)
        # NOTE(review): call truncated after DELIMIT_MODE.
        word_x = get_words(self.x, self.word_reverse_dict, DELIMIT_MODE,
        self.ngramed_id_x, self.ngrams_dict, self.worded_id_x, self.words_dict = \
            ngram_id_x(word_x, MAX_LENGTH_SUBWORDS, self.high_freq_words)
        # The char dictionary is the same object as the n-gram dictionary.
        self.chars_dict = self.ngrams_dict
        # NOTE(review): call truncated after self.chars_dict.
        self.chared_id_x = get_char_id_x(url_list, self.chars_dict,

        pos_x, neg_x = list(), list()
        for index in range(len(label_list)):
            label = label_list[index]
            # NOTE(review): both branches of this test are missing;
            # presumably they filled pos_x / neg_x.
            if label == 1:
        print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
        pos_x = np.array(pos_x)
        neg_x = np.array(neg_x)

        self.x_train, self.y_train, self.x_test, self.y_test = prep_train_test(
            pos_x, neg_x, DEV_PERCENTAGE)

        self.x_train_char = get_ngramed_id_x(self.x_train, self.ngramed_id_x)
        self.x_test_char = get_ngramed_id_x(self.x_test, self.ngramed_id_x)

        self.x_train_word = get_ngramed_id_x(self.x_train, self.worded_id_x)
        self.x_test_word = get_ngramed_id_x(self.x_test, self.worded_id_x)

        # NOTE(review): call truncated; the test-side line below suggests
        # the missing argument is self.chared_id_x.
        self.x_train_char_seq = get_ngramed_id_x(self.x_train,
        self.x_test_char_seq = get_ngramed_id_x(self.x_test, self.chared_id_x)

        self.dump_dict(self.ngrams_dict, NGRAMS_DICT_FILE)
        self.dump_dict(self.words_dict, WORDS_DICT_FILE)
        self.dump_dict(self.chars_dict, CHARS_DICT_FILE)
Exemple #30
    def __read_documents(self, id_class, class_path):
        """
        Accumulate per-word counts over every document in ``class_path`` and
        divide each count by the number of documents.

        Counts are added to ``self.__classes[id_class]``; after the pass,
        every value in that dict is divided by the directory's entry count.
        """
        document_names = os.listdir(class_path)
        size = len(document_names)
        class_words = self.__classes[id_class]
        for name_document in document_names:
            text_document = utils.read_file(
                os.path.join(class_path, name_document))
            word_list = utils.get_words(text_document)

            for word in word_list:
                if word in class_words:
                    class_words[word] += 1
                else:
                    # Restored branch: first sighting of the word.
                    class_words[word] = 1

        for key in class_words.keys():
            class_words[key] /= size
    def __fill_class(self, id_class, name_document, text_document):
        """
        Count, per document, the occurrences of selected words of
        ``text_document`` in ``self.__classes[id_class]``.

        A word is counted only when its ``nltk.pos_tag`` tag starts with
        'N', 'V', 'S', 'F' or 'J'. The result has the shape
        ``classes[id_class][word][name_document] -> count``.
        """
        class_words = self.__classes[id_class]
        for word in utils.get_words(text_document):
            # Each word is tagged on its own, without sentence context.
            tag = nltk.pos_tag([word])[0][1]
            if tag.startswith(('N', 'V', 'S', 'F', 'J')):
                # setdefault/get replace the two `else:` branches that were
                # lost from the nested membership tests.
                per_document = class_words.setdefault(word, dict())
                per_document[name_document] = (
                    per_document.get(name_document, 0) + 1)
def compute_p_class_given_input(input_path, p_word_given_class, p_class):
    """
    Return P(class | input).

    The value is a sum of ``ln`` terms over the words of the file at
    ``input_path`` (a log-score, not a normalised probability). Words absent
    from ``p_word_given_class`` use its 'UNKNOWN_WORD' entry.

    NOTE(review): ``p_class`` is multiplied into every word's term, so the
    prior is applied once per word rather than once per input, and an empty
    input scores 0.0. Confirm whether that is intended.
    """
    message = open_file(input_path)
    words = get_words(message)

    p_class_given_input = 0.0
    for word in words:
        if word in p_word_given_class:
            p_class_given_input += ln(p_word_given_class[word] * p_class)
        else:
            p_class_given_input += ln(p_word_given_class['UNKNOWN_WORD'] *
                                      p_class)

    return p_class_given_input
Exemple #33
    def do_preprocess(self, url_list):
        """
        Convert ``url_list`` to id sequences using the previously saved
        char, n-gram and word dictionaries.

        :param url_list:

        NOTE(review): the final call is truncated mid-argument-list, so this
        method does not parse as shown.
        """
        self.chars_dict = self.load_dict(CHARS_DICT_FILE)
        self.ngrams_dict = self.load_dict(NGRAMS_DICT_FILE)
        self.words_dict = self.load_dict(WORDS_DICT_FILE)

        x, word_reverse_dict = get_word_vocab(url_list, MAX_LENGTH_WORDS)
        word_x = get_words(x, word_reverse_dict, DELIMIT_MODE, url_list)

        self.ngramed_id_x, self.worded_id_x = \
            ngram_id_x_from_dict(word_x, MAX_LENGTH_SUBWORDS, self.ngrams_dict, self.words_dict)
        # NOTE(review): remaining arguments and closing parenthesis missing.
        self.chared_id_x = get_char_id_x(url_list, self.chars_dict,
Exemple #34
    def analyze_text(self, text):
        """
        Compute basic readability counts for ``text`` and store them,
        together with the word list, in ``self.analyzedVars``.

        All counts are converted to float. Raises ZeroDivisionError when
        ``get_sentences`` finds no sentence.
        """
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count/sentence_count

        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Exemple #35
 def analyze_text(self, text):
     """
     Compute basic readability counts for ``text`` and store them, together
     with the word list, in ``self.ana_vars``.

     The average words per sentence is truncated to an int before being
     stored as a float. The syllable count and the average are also
     printed. Raises ZeroDivisionError when no sentence is found.
     """
     words = get_words(text)
     char_count = get_char_count(words)
     words_count = len(words)
     sentence_count = len(get_sentences(text))
     syllable_count = count_syllables(words)
     print("syllable_count:", syllable_count)
     complex_words_count = count_complex_words(text)
     avg_words_per_sentence = int(words_count / sentence_count)
     print("avg_words_per_sentence", avg_words_per_sentence)
     self.ana_vars = {
         'words': words,
         'char_count': float(char_count),
         'words_count': float(words_count),
         'sentence_count': float(sentence_count),
         'syllable_count': float(syllable_count),
         'complex_words_count': float(complex_words_count),
         'avg_words_per_sentence': float(avg_words_per_sentence)
     }
    def analyze_text(self, text):
        """
        Compute basic readability counts for ``text`` and store them, with
        the file name and the word list, in ``self.analyzedVars``.

        All counts are converted to float. Raises ZeroDivisionError when
        ``get_sentences`` finds no sentence.
        """
        words = get_words(text)
        char_count = int(get_char_count(words))
        word_count = int(len(words))
        sentence_count = int(len(get_sentences(text)))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Exemple #37
def compute_p_class_given_input(input_path, p_word_given_class, p_class):
    """
    Return P(class | input).

    The value is a log-score: ``log(p_class)`` plus the sum of
    ``log(P(word | class))`` over every word of the file at ``input_path``.
    Words absent from ``p_word_given_class`` use its "UNKNOWN_WORD" entry.

    :param input_path: path readable by ``open_file``.
    :param p_word_given_class: dict of word probabilities; must contain
        "UNKNOWN_WORD" whenever the input has at least one word.
    :param p_class: prior probability of the class.
    """
    words = get_words(open_file(input_path))
    p_class_given_input = 0

    for word in words:
        # The fallback is looked up on every iteration, as before, so a
        # missing "UNKNOWN_WORD" key still surfaces on the first word.
        prob = p_word_given_class.get(word, p_word_given_class["UNKNOWN_WORD"])
        p_class_given_input += math.log(prob)

    p_class_given_input += math.log(p_class)

    return p_class_given_input
Exemple #38
def create_corpus(corpus_path):
    """
    Create the corpus file at ``corpus_path`` from the ``.gz`` files under
    ``CORPUS_FILE_PATH``, unless it already exists.

    Multi-word entries from ``get_words`` are rewritten in the corpus text
    with spaces replaced by underscores.

    NOTE(review): several statements are mangled or missing (see inline
    notes), so this function does not parse as shown.
    """
    if os.path.exists(corpus_path):
        print(f'Using corpus {corpus_path}')
        # NOTE(review): an `else:` is missing here; everything below belongs
        # to the "corpus does not exist yet" branch.
        print(f'Creating corpus {corpus_path}')

        word_list = [w.lower() for w in get_words(prefix='../')]
        # Map each multi-word entry to its underscore-joined form.
        replacements = {}
        for word in word_list:
            if ' ' in word:
                replacements[word] = word.replace(' ', '_')

        corpus_files = []
        # The list is overwritten on every directory visited, so only the
        # last directory's .gz files are kept.
        for path, dirs, files in os.walk(CORPUS_FILE_PATH):
            corpus_files = [f for f in files if f.endswith('.gz')]


        sentences = []
        for corpus_file in tqdm(corpus_files):
            # NOTE(review): the opener call before the path string is
            # missing (presumably a gzip open) -- restore from the original.
            with'{CORPUS_FILE_PATH}/{corpus_file}', 'rb') as f_in:
                for line in f_in:
                    line = line.decode('utf-8').rstrip()
                    # The keys are interpolated into the pattern unescaped.
                    for r in replacements:
                        line = re.sub(rf'\b{r}\b', replacements[r], line)
                    # NOTE(review): the processed line is never added to
                    # `sentences` in this copy.

        sentence_order = list(range(len(sentences)))

        print('writing corpus to file...')
        # NOTE(review): the opener call and its path argument are missing.
        f_out =, 'wb')
        for i in sentence_order:
            f_out.write((sentences[i] + '\n').encode('utf-8'))
        print('done writing corpus to file')
Exemple #39
def flesch_kincaid_score(article):
	"""
	Fetch the extract of ``article`` through ``xml_api_url`` and return its
	Flesch reading-ease score rounded to two decimals.

	NOTE(review): the try/except lines that framed the indented blocks are
	missing, and ``title`` is not defined in the visible code, so this
	function does not parse as shown. Python 2 print statements are used.
	"""
	xml_url = '&titles='.join([xml_api_url, title])
		xml = requests.get(xml_url).content
		bs = BeautifulSoup(xml)

			text = str(bs.find('extract').contents[0].encode('utf-8'))	# convert NavigableString to string after encoding
			# Everything from the first trailing-section heading on is cut.
			non_text = ['== See also ==\n', '== References ==\n', ' === Further references ===\n', '== External links ==\n', '== Notes ==\n']
			for ele in non_text:
				text = text.split(ele, 1)[0]
			# Drop the remaining "== heading ==" lines.
			text = re.sub('==.*==', '', text)
			words = get_words(text)
			syllableCount = count_syllables(text)
			sentences = get_sentences(text)
			# 206.835 - 1.015 * words/sentence - 84.6 * syllables/word
			fk = 206.835 - 1.015*len(words)/len(sentences) - 84.6*(syllableCount)/len(words)
			return float(format(fk,'.2f'))
			print 'Error while computing fk score of ' + article
			print format_exc()

		print 'Error while fetching xml content of ' + article
		print format_exc()
Exemple #40
def main():
    """Build a HashTable (hashsize 10) from the first 100 words and print it."""
    words = get_words()[:100]
    hashtable = HashTable(words, hashsize=10)
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # unused `item = words[0]` local was dropped.
    print(hashtable)
Exemple #41
import graph
import codecs
import utils
import rhymes
import generator

def get_args():
    """
    Parse the poem generator's command-line options from ``sys.argv``.

    Options: --source_text (path, default 'data/PanTadeusz.txt') and the
    integer options --syllable_count (13), --rhyme_span (2), --length (4)
    and --markov_order (1).
    """
    cli = argparse.ArgumentParser(description="Generate a nice poem :)")
    cli.add_argument('--source_text', default='data/PanTadeusz.txt')

    # Integer options, registered in the same order as before.
    int_options = (
        ('--syllable_count', 13),
        ('--rhyme_span', 2),
        ('--length', 4),
        ('--markov_order', 1),
    )
    for flag, default in int_options:
        cli.add_argument(flag, type=int, default=default)

    # Not enabled: --rhyme_pattern, --keyword_file

    return cli.parse_args()

if __name__ == '__main__':
    # Script entry point: read the source text, build the word graph and the
    # rhyme table from its words, then generate a poem.
    args = get_args()
    words = []
    # The `with` statement had lost its opener; codecs (imported above) with
    # the --source_text path is the reconstructed call.
    with codecs.open(args.source_text, 'rb', encoding='utf8') as f:
        for line in f:
            words += utils.get_words(line.rstrip())
    wg = graph.from_file(words, args.markov_order)
    rhs = rhymes.from_file(words, args.rhyme_span)
    poem = generator.create_poem(wg, rhs, args.syllable_count, args.length)
# Collect, per column index, the set of distinct values (binarized columns)
# or distinct words (tokenized columns) seen across all rows of `reader`.
unique_values = defaultdict( set )

# n counts the rows processed so far (used for progress output).
n = 0
for line in reader:
	for i in indexes2binarize:
		value = line[i]
		# Only values accepted by pass_filter for this column are kept.
		if pass_filter( value, headers[i] ):
			unique_values[i].add( value )
	# the same, but first get unique words from the column text
	# could also use non-unique words, as unique_values[i] is a set
	# using approach no2
	for i in indexes2tokenize:
		text = line[i]
		words = get_words( text )
		# filter
		words = filter_words( words, headers[i] )
		for w in words:
			unique_values[i].add( w )
	n += 1
	# Progress report every 10000 rows (Python 2 print statement).
	if n % 10000 == 0:
		print n	

# Report how many distinct entries each column collected.
for i in unique_values:
	print "%s: %s" % ( i, len( unique_values[i] ))
# calculate column offsets