Example #1
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}

    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
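
A minimal usage sketch for the scorer above; the imports and the tiny labeled word lists are assumptions added here for illustration, findBestWords itself is unchanged.

from nltk import FreqDist, ConditionalFreqDist
from nltk.collocations import BigramAssocMeasures

# two made-up categories with a handful of words each
words_by_category = [
    ('pos', ['good', 'great', 'fine', 'good']),
    ('neg', ['bad', 'awful', 'fine']),
]
best = findBestWords(words_by_category, max_words=3)
print(best)  # the (up to) 3 words with the highest summed chi-square score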
Example #2
def extract_ngrams(all_sentences: List[List[str]]) -> Tuple[Any, Any]:
    unigram_freqs = FreqDist()
    bigram_freqs = FreqDist()
    for sentence in all_sentences:
        unigram_freqs.update(ngrams(sentence, 1))
        bigram_freqs.update(ngrams(sentence, 2))
    return unigram_freqs, bigram_freqs
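
A quick sketch of calling extract_ngrams on a toy corpus; the imports are assumptions, since the snippet itself does not show them.

from typing import Any, List, Tuple
from nltk import FreqDist
from nltk.util import ngrams

uni, bi = extract_ngrams([['a', 'b', 'a'], ['b', 'c']])
print(uni[('a',)], bi[('a', 'b')])  # 2 1
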
def load_english_frequencies():
    nltk.download(['brown', 'gutenberg', 'reuters'])

    global english_frequencies
    english_frequencies = FreqDist(w.lower() for w in brown.words())
    english_frequencies.update(w.lower() for w in gutenberg.words())
    english_frequencies.update(w.lower() for w in reuters.words())
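
And a sketch of how the resulting global might be consulted afterwards, assuming nltk plus the brown, gutenberg and reuters corpora are available locally:

import nltk
from nltk import FreqDist
from nltk.corpus import brown, gutenberg, reuters

load_english_frequencies()
print(english_frequencies.most_common(5))                     # highest-frequency lowercased words
print(english_frequencies['the'] / english_frequencies.N())   # relative frequency of 'the'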
Example #4
    def _prepare(self):
        if self._is_prepared:
            return

        freq_dist_a = FreqDist()
        for a in self._pair.chunks_a:
            freq_dist_a.update(self._tokenize(a))

        freq_dist_b = FreqDist()
        for b in self._pair.chunks_b:
            freq_dist_b.update(self._tokenize(b))

        self._avg_freq_dist = FreqDist()
        n_a = freq_dist_a.N()
        n_b = freq_dist_b.N()
        for a in freq_dist_a:
            self._avg_freq_dist[a] = (freq_dist_a[a] / n_a + freq_dist_b[a] / n_b) / 2.0
        for b in freq_dist_b:
            if self._avg_freq_dist[b] != 0.0:
                continue
            self._avg_freq_dist[b] = (freq_dist_a[b] / n_a + freq_dist_b[b] / n_b) / 2.0

        self._chunks = self._sampler.generate_chunk_pairs(self._pair)

        self.__freq_a = None
        self.__freq_b = None

        self._is_prepared = True
Example #5
def pos_ngrams(t1, t2, order=3):
    """
    Generate POS n-gram distributions.

    :param t1: text1
    :param t2: text2
    :param order: n-gram order
    :return: tuple containing FreqDists
    """

    t1_freq = FreqDist()
    t2_freq = FreqDist()

    t1 = nltk.sent_tokenize(t1)
    for s in t1:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(s))
        t1_freq.update(
            tuple(map(lambda x: x[1], pos_tags[i:i + order]))
            for i in range(len(pos_tags) - order + 1))

    t2 = nltk.sent_tokenize(t2)
    for s in t2:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(s))
        t2_freq.update(
            tuple(map(lambda x: x[1], pos_tags[i:i + order]))
            for i in range(len(pos_tags) - order + 1))

    return t1_freq, t2_freq
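
A usage sketch for pos_ngrams, assuming nltk's punkt tokenizer and averaged perceptron tagger data have been downloaded:

import nltk
from nltk import FreqDist

t1_freq, t2_freq = pos_ngrams('The cat sat on the mat. It purred.',
                              'Dogs bark loudly at night.', order=2)
print(t1_freq.most_common(3))  # most frequent POS-tag bigrams in text 1
print(t2_freq.N())             # total POS-tag bigrams counted in text 2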
Example #6
def word_distr(category_tweets):
    word_dist = FreqDist()
    for tweet in category_tweets:
        tokens = tokenizer.tokenize(tweet.text)
        tokens = [x for x in tokens if x not in stop_words and not is_punctuation(x)]
        dist = FreqDist(tokens)
        word_dist.update(dist)
    return word_dist
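
word_distr relies on module-level tokenizer, stop_words and is_punctuation objects that the snippet does not show; the sketch below stubs them out and fakes the tweet objects, purely for illustration.

from types import SimpleNamespace
from string import punctuation
from nltk import FreqDist
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
stop_words = {'the', 'a', 'is'}
is_punctuation = lambda tok: all(ch in punctuation for ch in tok)

tweets = [SimpleNamespace(text='the cat is happy!'),
          SimpleNamespace(text='a happy dog')]
print(word_distr(tweets).most_common(3))  # 'happy' is counted twice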
Example #7
 def ngram_freq(self, speaker, token_count=1):
     """Return a FreqDist of ngrams of length token_count for speaker."""
     freq = FreqDist()
     for line in self.all_lines(speaker):
         for sent in line.sentences:
             freq.update(" ".join(ngram)
                         for ngram in ngrams(sent.tokenize(), token_count))
     return freq
Example #8
def compute_freq(text, N_gram):

    bigramfdist = FreqDist()

    tokens = text.strip().split(' ')
    bigrams = ngrams(tokens, N_gram)
    bigramfdist.update(bigrams)
    return bigramfdist
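
A quick check of compute_freq on a one-line string, with the imports it needs added as an assumption:

from nltk import FreqDist
from nltk.util import ngrams

dist = compute_freq('to be or not to be', N_gram=2)
print(dist[('to', 'be')])   # 2
print(dist.most_common(3))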
Example #9
    def __init__(self, params, picklefile, modelfile=None):
        texts = []
        with open(params['PATH_TRAIN']) as inf:
            for line in inf:
                temp = line.replace("\n", "")
                texts.append(temp[temp.index('\t') + 1:].lower())

        word_dist = FreqDist()
        for s in texts:
            word_dist.update(s.split())
        word_freq = dict(word_dist)
        word_index = {}
        c = 1
        for t in word_freq:
            word_index[t] = c
            c = c + 1
        pickle.dump(word_index, open("word_pickle.p", "wb"))

        word_prob = {}
        for t in word_freq:
            word_prob[t] = 1 / word_freq[t]
        pickle.dump(word_index, open("word_freq.p", "wb"))

        self.word_freq = word_freq
        self.word_index = word_index

        embeddings_index = {}
        f = open(os.path.join(params['GLOVE_DIR'], 'glove.6B.50d.txt'))
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        # Obtained all the word embeddings. Fetch for word 'w', as embeddings_index[w]

        embedding_matrix = np.zeros(
            (params['VOCAB_SIZE'], params['EMBEDDING_DIM']))
        with open(picklefile, 'rb') as f:
            self.encoding = pickle.load(f, encoding='latin1')
        # self.encoding = pickle.load(open(picklefile, 'rb'))
        self.word_index = word_index
        self.embedding_matrix = embedding_matrix
        self.params = params

        if (modelfile == None):
            self.model = Sequential()
            embedding_layer = Embedding(
                params['VOCAB_SIZE'],
                params['EMBEDDING_DIM'],
                weights=[embedding_matrix],
                input_length=params['MAX_SEQUENCE_LENGTH'],
                trainable=False)
            self.model.add(embedding_layer)
        else:
            self.model = load_model(modelfile)
Example #10
    def get_stats(self, output_fname):
        fd = FreqDist()
        for text in self.texts:
            fd.update(set(text))

        with open(output_fname, 'w') as fh:
            text = Text(self.paragraph_tokens)
            fdist = FreqDist(text)
            for w, f in fdist.items():
                fh.write("%s\t%i\n" % (w, f))
Example #11
def frecuencias_terminos(tokens):
    term_freq = FreqDist()
    for i in range(len(tokens)):
        for j in tokens[i]:
            term_freq.update(FreqDist(j))
    y = [count for tag, count in term_freq.most_common(30)]
    x = range(1, len(y) + 1)
    print(term_freq.most_common(30))
    plt.bar(x, y)
    plt.title("Frecuencias de los terminos")
    plt.ylabel("Frecuencia")
    plt.show()
def compute_freq(text_body, ngram_n=6):
    stop_words = set(stopwords.words('english'))
    n_gramfdist = FreqDist()
    for line in text_body:
        if len(line) > 1:
            tokens = line.strip().split(' ')
            # tokens_without_stops = [x.lower() for x in tokens if x.lower() not in stop_words]
            # n_grams = ngrams(tokens_without_stops, 3)
            n_grams = ngrams(tokens, ngram_n)
            n_gramfdist.update(n_grams)

    return n_gramfdist
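
The list-of-lines variant above can be exercised the same way; note that the stopword set is loaded but only used if the commented-out filtering is restored, so nltk's stopwords corpus must still be installed.

from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.util import ngrams

lines = ['one two three four five six seven', 'hi']
dist = compute_freq(lines, ngram_n=6)
print(dist.most_common(2))   # the two 6-grams from the first line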
Example #13
class BiWordExtractor:
    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()

    """
    Processes statuses. (For information on how the different data structures
    are set up, look at the comments for the getters.)
    """

    def wordprocess(self):
        lengths = dict()
        row = 0
        for status in self._statuses[1:]:
            row += 1
            print(row)
            user = status[0]

            filtered_status = status[1].translate(
                str.maketrans('', '', string.punctuation))

            tokens = pattern_split.split(filtered_status.lower())

            filtered_tokens = [
                w for w in tokens
                if w not in stopwordslist and w not in filterlist
            ]

            bitokens = nltk.bigrams(filtered_tokens)

            if status[5] == '+':
                self.fdistneuro.update(bitokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(bitokens)

    def neuro_word_frequency(self):
        # most_common() keeps the frequency ordering the old keys()[:300] slicing relied on
        highvocneuro = [w for w, _ in self.fdistneuro.most_common(300)]
        return highvocneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            if self.fdistneuro[w] >= 5:
                self.highneuro[w] = self.fdistneuro[w]

        print(self.highneuro.items())
        print(self.highneuro.keys())
        return self.highneuro.keys()
Example #14
def get_uni(first, second, uni):
    bigramfdist = FreqDist()
    for line in first:
        token = nltk.word_tokenize(line)
        token = [
            x for x in token
            if not re.fullmatch('[' + string.punctuation + ']+', x)
        ]
        bigrams = ngrams(token, 1)
        bigramfdist.update(bigrams)

    print(bigramfdist.most_common(50))
    print(bigramfdist.get("but"))
def get_ngrams(fileLines, n, pos_tag_dict):
    # Get n-gram counts for the corpus by accumulating the per-excerpt counts
    ngram_counts = FreqDist()
    for excerpt in fileLines:
        ngram_counts_exp = get_ngram_counts_per_excerpt(excerpt, n, pos_tag_dict)
        ngram_counts.update(ngram_counts_exp)
    return ngram_counts
Example #16
 def train_finder(self, all_listings):
     """
     Train the product identification algorithm with example data.
     """
     logging.info("Start training of recognizer for product: {0}"
                  .format(self.product_id))
     self.classifier = None
     
     #select example listings for the finder's product
     listings, n_pos, n_neg = self.filter_trainig_samples(all_listings)
     logging.info("Number listings: {l}, positive: {p}, negative: {n}; "
                  "features: {f}"
                  .format(l=len(listings), p=n_pos, n=n_neg,
                          f=self.n_features))
     if len(listings) < 30:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few listings."
                      .format(self.product_id))
         return
     elif n_pos < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few positive listings."
                      .format(self.product_id))
         return
     elif n_neg < 10:
         logging.warn("Product {0}. Can't compute classifier. "
                      "Too few negative listings."
                      .format(self.product_id))
         return
     
     #Create list of most common words, and put it into feature extractor
     #TODO: remove stop-words
     self.feature_extractor = FeatureExtractor()
     word_freqs = FreqDist()
     for _, listing in listings.iterrows():
         words = self.feature_extractor.extract_words(listing)
         word_freqs.update(words)
     common_words = [w for w, _ in word_freqs.most_common(self.n_features)]
     self.feature_extractor = FeatureExtractor(common_words)
     logging.debug("Number individual words: {0}; hapaxes: {1}"
                   .format(len(word_freqs), len(word_freqs.hapaxes())))
     logging.debug("Most common words: {}".format(word_freqs.keys()[:100]))
     
     #Train the classifier
     train_set = self.create_labeled_features(listings)
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
     self.classifier.show_most_informative_features(20)
Example #17
def create_vocabulary(_text, rmv_stop_wrds):
    # create an empty network of model's vocabulary
    def init_vocab_network(n_inputs):
        network = list()
        for i in range(0, n_inputs):
            layer = {'value': 0, 'token': ''}
            network.append(layer)
        return network

    # given a list of words, return a dictionary of word-frequency pairs.
    def wordlist_to_freq_dict(wrdlist):
        wordfreq = [wrdlist.count(p) for p in wrdlist]
        return dict(zip(wrdlist, wordfreq))

    # sort the dictionary of word-frequency pairs in descending order
    def sort_freq_dict(freqdict):
        aux = [(freqdict[key], key) for key in freqdict]
        aux.sort()
        aux.reverse()
        return aux

    if rmv_stop_wrds:
        print('removing stop words...')
        tokenized_text = nltk.word_tokenize(_text)
        stopwords = nltk.corpus.stopwords.words('english')
        word_freq = nltk.FreqDist(tokenized_text)
        dict_filter = lambda word_freq, stopwords: dict(
            (word, word_freq[word]) for word in word_freq
            if word not in stopwords)
        wordlist = dict_filter(word_freq, stopwords)
    else:
        wordlist = FreqDist()
        wordlist.update(_text.split())

    sort_freq_list = sort_freq_dict(wordlist)
    # initiate model's vocabulary
    _voc = init_vocab_network(len(sort_freq_list))

    # update vocabulary values
    j = 0

    for index in sort_freq_list:
        # plus one to avoid the zero padding
        _voc[j]['value'] = j + 1
        _voc[j]['token'] = index[1]
        j += 1
    return _voc, len(_voc)
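
A small sketch of create_vocabulary with stop-word removal turned off, so that no corpus downloads are needed:

import nltk
from nltk import FreqDist

voc, size = create_vocabulary('the cat sat on the mat', rmv_stop_wrds=False)
print(size)    # 5 distinct tokens
print(voc[0])  # {'value': 1, 'token': 'the'} -- the most frequent token comes first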
Example #18
def ngram_probs(filename='D:/raw_sentences.txt'):
    textfile = open(filename)

    bigram_fdist = FreqDist()
    threegram_fdist = FreqDist()

    for line in textfile:
        if len(line) > 1:
            tokens = line.lower().strip().split(' ')

            bigrams = ngrams(tokens, 2)
            bigram_fdist.update(bigrams)

            threegrams = ngrams(tokens, 3)
            threegram_fdist.update(threegrams)

    return bigram_fdist, threegram_fdist
Example #19
def check_svc_bef_aft(list_line, command):
    # check the freq of words before and after bus service
    # check the freq of words before and after of word (number) which is non bus svc
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        for k in range(0, len(split_second)):
            if command == 'before_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'before_svc':
                        if k > 0:  # bus svc doesn't appear at the first position of sentences
                            text = text + split_first[k - 1].lower() + ' '  # take the word before
                print(i, k, split_first[k])

            if command == 'after_svc':
                if int(split_second[k]) == 1:  # mean bus svc
                    if command == 'after_svc':
                        if k < len(split_second) - 1:
                            text = text + split_first[k + 1].lower() + ' '  # take the word after

            if command == 'before_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k > 0:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k - 1].lower() + ' '

            if command == 'after_notsvc':
                if RepresentsInt(split_first[k]) is True and int(split_second[k]) != 1:  # text is a number and not a bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the last position of sentences
                        text = text + split_first[k + 1].lower() + ' '

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print(value[0], '\t', value[1])

    print(text)
Example #20
def get_vocab(series, addtional_tokens=[], top=None):
    """
    extract the vocabulary from an array of sentences, allowing additional tokens to be added and only the top n most frequent words to be kept.
    :param series: array of sentences
    :param addtional_tokens: additional tokens we want to include in the vocabulary
    :param top: top n frequent words we want to include in the vocabulary
    :return: map from a word to its numeric representation and the opposite map
    """
    rev_vocab = addtional_tokens
    freq_vocab = FreqDist()
    for s in tqdm(series):
        freq_vocab.update(word_tokenize(decontracted(s)))
    print("Original vocab size %s" % len(freq_vocab))
    all_words_sorted = sorted(freq_vocab, key=freq_vocab.get, reverse=True)
    top_words = all_words_sorted[:top]
    rev_vocab += top_words
    vocab = {word: index for index, word in enumerate(rev_vocab)}
    return vocab, rev_vocab
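
get_vocab depends on a project-specific decontracted helper that is not shown; the sketch below substitutes an identity function for it (an assumption made only so the call runs) and assumes nltk's punkt data is available for word_tokenize.

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def decontracted(text):
    # stand-in for the original helper, which expands contractions
    return text

vocab, rev_vocab = get_vocab(['the cat sat', 'the dog ran'],
                             addtional_tokens=['<pad>', '<unk>'], top=5)
print(vocab['<pad>'], vocab['<unk>'])  # 0 1
print(rev_vocab[:3])                   # special tokens first, then the most frequent words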
Example #22
def get_frequency_distribution(docs, n=1):
    """ Get the n-gram terms frequency distribution from a list of strings

    Parameters:
        docs: list of strings
        n: an integer

    Returns:
        nltk.FreqDist
    """
    ngram_freq_dist = FreqDist()

    for doc in docs:
        if isinstance(doc, str):
            tokens = word_tokenize(doc)
            ngram_tokens = ngrams(tokens, n)
            ngram_freq_dist.update(ngram_tokens)

    return ngram_freq_dist
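
A usage sketch for get_frequency_distribution, assuming word_tokenize and ngrams come from nltk and the punkt data is installed; non-string entries are simply skipped.

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

docs = ['the quick brown fox', 'the lazy dog', 42]   # the 42 is ignored
bigram_dist = get_frequency_distribution(docs, n=2)
print(bigram_dist[('the', 'quick')])   # 1
print(bigram_dist.most_common(5))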
Example #23
    def __prepare_vocabulary(self, train_captions):
        vocab = FreqDist()
        for caption in train_captions:
            vocab.update([
                token.text.lower()
                for token in english_tokenizer.tokenizer(caption)
            ])
        # Histogram
        # hist, bin_edges = np.histogram(list(vocab.values()), bins=20000, density=False)
        most_common_words = list(
            map(lambda token: token[0],
                vocab.most_common(NUMBER_OF_WORDS_FOR_VOCABULARY)))

        idx = 4
        for word in most_common_words:
            if word not in self.stoi.keys():
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1
Example #24
    def fetch_if(self, cond, term, pos_is_target=True, include_pair=False):

        tmp_freq_dist = FreqDist()

        conditions = {
            ng_prefix: ["pos[:-1] == term", "token[:-1] == term"],
            ng_suffix:
            ["pos[-len(term):] == term", "token[-len(term):] == term"],
            ng_contain: [
                "self._is_subcontent(term, pos)",
                "self._is_subcontent(term , token)"
            ],
            ng_equal: ["pos == term", "token == term"]
        }

        if cond not in conditions:
            cond = ng_prefix

        # Fetching Choice Configuration
        p_key, t_key = "", ""
        if include_pair:
            p_key = "(pos, token)"
            t_key = "(token, pos)"
        else:
            p_key = "pos"
            t_key = "token"
        cmp_p = compile(p_key, '<string>', 'eval')
        cmp_t = compile(t_key, '<string>', 'eval')

        if pos_is_target:
            cmp_cond = compile(conditions[cond][0], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_p): freq})
        else:
            cmp_cond = compile(conditions[cond][1], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_t): freq})

        return tmp_freq_dist
def token_aft(list_line, command):
    # check the token after label, note that belongs to the command ('svc', 'road', 'busstop')
    text = ''
    list_length = []

    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of label for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of token before bus service
                if int(split_second[k]) == 1:  # mean bus svc
                    if k < len(split_second) - 1:  # bus svc doesn't appear at the first position of sentences
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k + 1].lower())  # take the token after label
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '

                        # if stem_word == 'sd' or stem_word == 'dd':
                        #     print list_line[i]

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        # print value[0], '\t', value[1]
        print(value[0])

    print(text)
Example #26
def term_freq_all(path, name):
    file = path + '/' + name

    fdist = FreqDist()
    list_line = []
    with open(file) as f:
        for line in f:
            split_line = line.split('\t')
            words = nltk.word_tokenize(split_line[1].lower().strip())
            fdist.update(words)
            print(split_line[0])

            # list_stem = []
            # for token in words:
            #     # st = LancasterStemmer()
            #     # try:
            #     #     list_stem.append(st.stem(token).decode('utf-8'))
            #     # except:
            #     #     print (split_line[0])
            #
            #     st = PorterStemmer()
            #     try:
            #         list_stem.append(st.stem(token).decode('utf-8'))
            #     except:
            #         print (split_line[0])
            # fdist.update(list_stem)


            #print (line)

    print ('==========================================')
    print ('==========================================')
    print (len(fdist))
    stop = stopwords.words('english')


    for value in fdist.most_common(15000):
        # if (value[0] not in stop and (len(value[0]) >= 4)):
        if (value[0] not in stop):
            print(value[0] + '\t' + str(value[1]))
Example #27
def get_desc_graph(part, creator, resource, comments):
    unigramdist = FreqDist()
    bigramfdist = FreqDist()
    fc = comments

    if part != 'all':
        fc = fc[fc['primary_category'] == part]

    if creator != 'all':
        fc = fc[fc['creator_department'] == creator]

    if resource != 'all':
        fc = fc[fc['resource_type'] == resource]

    for index, sentence in fc.iterrows():
        unigrams = ngrams(sentence['tokens'], 1)
        bigrams = ngrams(sentence['tokens'], 2)
        unigramdist.update(unigrams)
        bigramfdist.update(bigrams)

    return unigram_freq_graph(unigramdist), bigram_freq_graph(bigramfdist), \
           unigram_word_cloud(unigramdist), bigram_word_cloud(bigramfdist)
Example #28
def term_freq_time(first, last):
    ## get the time convert in sgforum
    db = MySQLdb.connect(host="localhost", # your host, usually localhost
                         user="******", # your username
                          passwd="ducthong", # your password
                          db="sgforums_singaporebuses") # name of the data base

    # you must create a Cursor object. It will let
    #  you execute all the queries you need
    cur = db.cursor()

    # Use all the SQL you like
    sql = "select p.post_id, s.createdAtSecond, p.summary from posts_filter p, posts_createatsecond s where p.post_id = s.post_id and s.createdAtSecond >= " \
          + str(first) + " and s.createdAtSecond <= " + str(last) + " order by s.createdAtSecond;"
    cur.execute(sql) #call the database which name 'posts'

    fdist = FreqDist()
    for row in cur.fetchall():
        post_id = str(row[0])
        createdAtSecond = str(row[1])
        summary = str(row[2])
        #print (post_id + '\t' + createdAtSecond + '\t' + summary)
        words = nltk.word_tokenize(summary.lower().strip())
        # try:
        #     words = nltk.word_tokenize(summary.lower().strip().decode('utf-8'))
        # except:
        #     print (post_id + '\t' + summary)
        fdist.update(words)
    cur.close()
    print ('==========================================')
    print ('==========================================')
    print (len(fdist))
    stop = stopwords.words('english')

    for value in fdist.most_common(200):
        if (value[0] not in stop and len(value[0]) >= 3):
            print(value[0] + '\t' + str(value[1]))
Example #29
def count_corpus_frequency2(sentences):
    sentences_tokens = [line.strip().split(" ") for line in sentences]

    freq1 = FreqDist()
    freq12 = FreqDist()
    freq2 = FreqDist()

    for sentence in sentences_tokens:
        sentence = [x for x in sentence if x != '']
        bigrams = ["{0} {1}".format(t[0], t[1]) for t in ngrams(sentence, 2)]

        freq1.update(sentence)
        freq12.update(sentence)
        freq12.update(bigrams)
        freq2.update(bigrams)

    return freq1, freq12, freq2
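
A tiny check of count_corpus_frequency2 on two whitespace-separated sentences; the imports are assumptions added for the sketch.

from nltk import FreqDist
from nltk.util import ngrams

freq1, freq12, freq2 = count_corpus_frequency2(['a b a', 'b c'])
print(freq1['a'], freq1['b'])       # 2 2
print(freq2['a b'], freq2['b a'])   # 1 1
print(freq12.N())                   # 8: five unigrams plus three bigrams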
Example #30
def check_bef_aft_roadBusStop(list_line, command):
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break

            if command == 'bef_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before

                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

            if command == 'aft_road':
                try:
                    if int(split_second[k]) == 2:  # take road
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1

                except ValueError:
                    k += 1

            if command == 'bef_busstop':
                try:
                    if int(split_second[k]) == 3:  # take busstop
                        if k > 0:
                            text = text + connect_token(split_first[k - 1].lower()) + ' '  # take the word before

                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

            if command == 'aft_busstop':
                try:
                    if int(split_second[k]) == 3:  # take road
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                        if k < len(split_second) - 1:
                            if is_int(split_first[k]) is False:
                                text = text + connect_token(split_first[k].lower()) + ' '  # take the token after the label
                    else:
                        k += 1

                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print(value[0], '\t', value[1])

    print(text)
Example #31
                    YOURSELVES YOU'VE"""
set_function_words = list(set(function_words.lower().split(" ")))
set_function_words += " "

files = []
i = 1
path = 'output/'

for filename in glob('Mini-CORE/*.txt'):
    with open(filename, 'r', encoding='utf8') as f:
        w_file = open(path + str(i) + ".md", 'w', encoding='utf8')
        clear = re.compile('<.*?>')
        n_clear = re.compile('\n')
        preClearText = re.sub(clear, '', f.read())
        clearText = re.sub(n_clear, '', preClearText).lower()
        # w_file.write(clearText + '\n\n')
        tokens = sorted(list(clearText.split(" ")))
        fd = FreqDist()
        for word in tokens:
            if word not in set_function_words:
                fd.update([word])
        fd_sorted = sorted(fd, key=fd.get, reverse=True)
        w_file.write('Sorted by values :\n')
        for word in fd_sorted:
            w_file.write(str(word) + ' ')
        w_file.write('\n\nList :\n')
        for word in fd:
            w_file.write(str(word) + ' : ' + str(fd[word]) + '\n')
        w_file.close()
        i += 1
Example #32
def main(download_settings_filename, parse_settings_filename):
    with open(download_settings_filename, 'r') as f:
        download_config = json.load(f)
    with open(parse_settings_filename, 'r') as f:
        parse_config = json.load(f)
    topic = download_config.get('topic', 'Medicine')
    data_dir = os.path.join(
        download_config.get('save_dir', os.path.join('data', 'wiki')), topic)
    save_dir = os.path.join(
        parse_config.get('save_dir', os.path.join('artifacts', 'wiki')), topic,
        'vocab')
    exclude_vocab = parse_config.get('exclude_vocab', [])
    min_page_vocab = parse_config.get('min_page_vocab', 5)
    plot_top_k = parse_config.get('plot_top_k', 40)
    plot_cumulative = parse_config.get('plot_cumulative', True)
    plot_title = 'top {} frequency'.format(
        plot_top_k) if not plot_cumulative else 'top {} cumulative'.format(
            plot_top_k)
    make_plots = plot_top_k > 0

    wiki_url = 'https://en.wikipedia.org/wiki/Category:{}'.format(topic)

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    word_tokenizer = NISTTokenizer().tokenize
    lem = nltk.WordNetLemmatizer()

    S = requests.Session()

    pages = glob(os.path.join(data_dir, '*.html'))

    total_vocab = FreqDist()
    document_vocabs = {}
    print('reading {} files and generating vocabulary'.format(len(pages)))
    os.makedirs(save_dir, exist_ok=True)
    for page in tqdm(pages):
        l = process_page(S, page, exclude_vocab, word_tokenizer, lem,
                         sent_tokenizer)
        # ignore pages with very small vocabulary
        if len(l) < min_page_vocab:
            continue
        document_vocabs[page] = FreqDist(l)
        total_vocab.update(l)
        save_filename = os.path.join(
            save_dir,
            os.path.basename(page[:page.rfind('.')]) + '.json')
        with open(save_filename, 'w') as f:
            json.dump(dict(document_vocabs[page]), f, indent=4)
        if make_plots:
            save_filename = save_filename[:save_filename.rfind('.')] + '.pdf'
            save_freq_plot(save_filename,
                           document_vocabs[page],
                           max_num=plot_top_k,
                           cumulative=plot_cumulative,
                           title=plot_title)
    with open(os.path.join(save_dir, 'total_count.json'), 'w') as f:
        json.dump(dict(total_vocab), f, indent=4)
    if make_plots:
        save_filename = os.path.join(save_dir, 'total_count.pdf')
        save_freq_plot(save_filename,
                       total_vocab,
                       max_num=plot_top_k,
                       cumulative=plot_cumulative,
                       title=plot_title)
Example #33
class Corpus:
    def __init__(self, documents=None):
        """
        Corpus constructor
        documents is a list of documents
        """
        self.docs = {}
        if documents:
            for doc in documents:
                self.docs[doc.doc_id] = doc
            self.nltk_text_collection = TextCollection([x.to_nltk_text() for x in self.docs.values()])
        self.term_index = {}
        self._vocabulary = None
        self.clear_indexes()
        self.pp = pprint.PrettyPrinter(indent=4)

    #def neighbors(self, document, window_size=9)

    def __len__(self):
        return len(self.docs.keys())

    def __contains__(self, a):
        return a in self.docs

    def __getitem__(self, x):
        """Return the document with Document ID x"""
        return self.docs[x]

    def categories(self):
        """
        Returns list of categories in this corpus
        For combined corpora, categories are equivalent to document ids
        """
        return self.docs.keys()

    def _old_neighbors(self, document, window_size=9):
        """
        neighbors based on moving window
        window_size is the diameter from the index element
        that should be included in the results
        """
        if not self.sorted_by_len:
            self.generate_neighbor_list()

        if document.doc_id not in self.inverse_len_index:
            return []

        index = self.inverse_len_index[document.doc_id]

        l = len(self.sorted_by_len)
        r = window_size // 2  # integer radius so the slice indices stay ints
        start = max(0, index - r)
        end = min(l, (index + 1) + r)
        n = self.sorted_by_len[start:index] + self.sorted_by_len[(index+1):end]
        return [self.__getitem__(x) for x in n]

    def neighbors(self, document, max_distance):
        """
        neighbors based on moving window
        distance is the maximum distance from the index element
        that should be included in the results
        """
        sorted_dist_vector = self.generate_neighbor_list(document)
        filtered = [pair for pair in sorted_dist_vector if pair[1] <= max_distance]
        return [self.__getitem__(x[0]) for x in filtered]

    def _sort_dict_by_value(self, d):
        return sorted(d.items(), key=operator.itemgetter(1))

    def _sorted_dict_index(self, pairs):
        return [i for i, j in pairs]

    def add(self, document):
        """
        Add a document to this collection
        If there is any current iterator using this collection, it is
        not modified.  You need to re-initialize the iterator if you want
        to include the new items.
        """
        #print "adding " + str(document.doc_id)
        self.docs[document.doc_id] = document
        self.clear_indexes()

    def clear_indexes(self):
        self.doc_lens = None
        self.dist_matrix = None
        self.sorted_by_len = None
        self.inverse_len_index = None
        self.inverse_dist_index = None

    def generate_doc_lens(self):
        self.doc_lens = {}
        for document in self.docs.values():
            doc_id = document.doc_id
            shn = len(document)
            self.doc_lens[document.doc_id] = shn

    def char_dist(self, doc1, doc2):
        "distance function by difference between document lengths"
        return abs(self.doc_lens[doc1] - self.doc_lens[doc2])

    def generate_dist_vector(self, document, dist_func=char_dist):
        if (isinstance(document, Document)):
            doc_id = document.doc_id
        elif type(document) == str:
            doc_id = document
        if self.doc_lens == None:
            self.generate_doc_lens()
        v = {}
        for target in self.docs.keys():
            v[target] = dist_func(self, doc_id, target)
        return v

    def generate_dist_matrix(self):
        if self.doc_lens == None:
            self.generate_doc_lens()
        if self.dist_matrix == None:
            self.dist_matrix = {}
        for doc1 in self.docs.keys():
            self.dist_matrix[doc1] = self.generate_dist_vector(doc1)

    def _generate_neighbor_list(self):
        if self.doc_lens == None:
            self.generate_doc_lens()
        self.sorted_by_len = self._sorted_dict_index(self._sort_dict_by_value(self.doc_lens))
        self.inverse_len_index = {}
        for idx, val in enumerate(self.sorted_by_len):
            self.inverse_len_index[val] = idx

    def generate_neighbor_list(self, document):
        dist_vector = self.generate_dist_vector(document)
        return self._sort_dict_by_value(dist_vector)

    def __next__(self):
        if self.cursor_position >= len(self.found_docs):
            raise StopIteration
        else:
            self.cursor_position += 1
            doc = self.docs[self.found_docs[self.cursor_position - 1]]
            return doc

    def __iter__(self):
        self.cursor_position = 0
        self.found_docs = list(self.docs.keys())
        return self

    def to_nltk_text_collection(self):
        if self.nltk_text_collection:
            return self.nltk_text_collection
        else:
            self.nltk_text_collection = TextCollection([x.to_nltk_text() for x in self.docs.values()])
            return self.nltk_text_collection
        return None

    # wtf nltk
    def index(self):
        for k in self.docs.keys():
            for word in self.docs[k].words():
                if word in self.term_index:
                    self.term_index[word].add(k)
                else:
                    self.term_index[word] = set()
                    self.term_index[word].add(k)

    def df(self, term):
        if not self.term_index:
            self.index()
        #self.pp.pprint(self.term_index)
        if term in self.term_index:
            return len(self.term_index[term])
        else:
            return 0

    def idf(self, term):
        df = self.df(term)
        if df == 0.0:
            return 0.0
        else:
            return math.log(float(len(self)) / float(self.df(term)))

    # Use non-augmented tf for now, can experiment later
    def tf(self, doc_id, term):
        return self.docs[doc_id].tf(term)

    def tf_idf(self, doc_id, term):
        return self.tf(doc_id, term) * float(self.idf(term))

    def vocabulary(self):
        if self._vocabulary == None:
            self._vocabulary = FreqDist()
            for doc in self.docs.values():
                self._vocabulary.update(dict(doc.freq_dist()))
        return self._vocabulary

    def tf_idf_vector(self, doc_id):
        """return the TF-IDF term vector for a document
        the length of the vector is equal to the vocabulary size, not the
        number of terms in the document"""
        v = [0.0] * len(self.vocabulary())
        d = self.docs[doc_id]
        if d:
            fd = d.freq_dist()
            for idx, word in enumerate(self.vocabulary()):
                if word in fd:
                    v[idx] = self.tf_idf(doc_id, word)
        return v

    def ranked_terms(self, doc_id, n=None):
        """
        returns a list of the top terms by TF-IDF in a document
        if n is none, return all terms.  Otherwise return the top n
        terms.
        """
        d = self.docs[doc_id]
        if d:
            v = {}
            fd = d.freq_dist()
            for word in fd.keys():
                v[word] = self.tf_idf(doc_id, word)
        sorted_v = sorted(v.items(), key=operator.itemgetter(1))
        sorted_v.reverse()

        if n != None:
            return sorted_v[0:n]
        else:
            return sorted_v

    def top_terms(self, n=5):
        r = []
        for document in self.docs.values():
            r.append(self.ranked_terms(document.doc_id, n))
        return r

    def to_scikit_learn_dataset(self):
        dataset = {}
        dataset["data"] = []
        dataset["ids"] = []
        #dataset["filenames"]
        for doc_id in self.docs.keys():
            dataset["ids"].append(doc_id)
            dataset["data"].append(unicode(self.docs[doc_id]))
        b = Bunch(DESCR=None, ids=dataset["ids"], data=dataset["data"])
        return b

    def keys_sorted_by_attribute(self, attribute="created_time"):
        """
        Return the list of document ids sorted by a document attribute
        """
        d = []
        for doc_id in self.docs.keys():
            if attribute in self.docs[doc_id].document:
                d.append((doc_id, self.docs[doc_id].document[attribute]))
        # sort the list of doc_id, attribute tuples by the attribute
        return [x[0] for x in sorted(d, key=itemgetter(1))]

    def process_pipeline(self, pipeline):
        for doc in self.docs.values():
            res = pipeline.process(doc)
Example #34
	def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True,
				cleanup=True, normalize=False, extratags=(),
				parser=InsideChartParser, **parseroptions):
		""" initialize a DOP model given a treebank. uses the Goodman
		reduction of a STSG to a PCFG.  after initialization,
		self.parser will contain an InsideChartParser.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> print d.grammar
		    Grammar with 8 productions (start state = S)
			NP -> 'mary' [1.0]
			NP@1 -> 'mary' [1.0]
			S -> NP VP [0.25]
			S -> NP VP@2 [0.25]
			S -> NP@1 VP [0.25]
			S -> NP@1 VP@2 [0.25]
			VP -> 'walks' [1.0]
			VP@2 -> 'walks' [1.0]
		>>> print d.parser.parse("mary walks".split())
		(S (NP mary) (VP@2 walks)) (p=0.25)		
		
		@param treebank: a list of Tree objects. Caveat lector:
			terminals may not have (non-terminals as) siblings.
		@param wrap: boolean specifying whether to add the start symbol
			to each tree
		@param normalize: whether to normalize frequencies
		@param parser: a class which will be instantiated with the DOP 
			model as its grammar. Supports BitParChartParser.
		
		instance variables:
		- self.grammar a WeightedGrammar containing the PCFG reduction
		- self.fcfg a list of strings containing the PCFG reduction 
		  with frequencies instead of probabilities
		- self.parser an InsideChartParser object
		- self.exemplars dictionary of known parse trees (memoization)"""
		from bitpar import BitParChartParser
		nonterminalfd, subtreefd, cfg = FreqDist(), FreqDist(), FreqDist()
		ids = count(1)
		self.exemplars = {}
		if wrap:
			# wrap trees in a common root symbol (eg. for morphology)
			treebank = [Tree(rootsymbol, [a]) for a in treebank]
		if cnf:
			#CNF conversion is destructive
			treebank = list(treebank)
			for a in treebank:
				a.chomsky_normal_form() #todo: sibling annotation necessary?

		# add unique IDs to nodes
		utreebank = [(tree, decorate_with_ids(tree, ids)) for tree in treebank]

		# count node frequencies
		for tree, utree in utreebank:
			nodefreq(tree, utree, subtreefd, nonterminalfd)

		if isinstance(parser, BitParChartParser):
			lexicon = set(x for a, b in utreebank for x in a.pos() + b.pos())
			# this takes the most time, produce CFG rules:
			cfg = FreqDist(chain(*(self.goodman(tree, utree)
								for tree, utree in utreebank)))
			cfg.update("%s\t%s" % (t, w) for w, t in extratags
								if w not in lexicon)
			lexicon.update(a for a in extratags if a not in lexicon)
			# annotate rules with frequencies
			self.fcfg = frequencies(cfg, subtreefd, nonterminalfd, normalize)
			self.parser = BitParChartParser(self.fcfg, lexicon, rootsymbol,
									cleanup=cleanup, **parseroptions)
		else:
			cfg = FreqDist(chain(*(self.goodman(tree, utree, False)
							for tree, utree in utreebank)))
			probs = probabilities(cfg, subtreefd, nonterminalfd)
			#for a in probs: print a
			self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
			self.parser = InsideChartParser(self.grammar)
			
		#stuff for self.mccparse
		#the highest id
		#self.addresses = ids.next()
		#a list of interior + exterior nodes, 
		#ie., non-terminals with and without ids
		#self.nonterminals = nonterminalfd.keys()
		#a mapping of ids to nonterminals without their IDs
		#self.nonterminal = dict(a.split("@")[::-1] for a in 
		#	nonterminalfd.keys() if "@" in a)

		#clean up
		del cfg, nonterminalfd
all_words = FreqDist(w.lower() for w in train_set_words).keys()

def tweet_features(tweet):
    tweet_words = word_tokenize(tweet)
    features = {}
    for word in all_words:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for item in train_set:
    tweet = item[0].lower()
    words = word_tokenize(item[0])
    word_fd.update(words)
    label_word_fd[item[1]].update(words)

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
 
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score
        
        
        if word1.lower() in tokens_tags:
            tokens_tags[word1].append(tag)
        else:
            tokens_tags[word1]=[tag]
            
        tags.append(tag)
        if (tag,word1) in tag_word:
            tag_word[(tag,word1)]=tag_word[(tag,word1)]+1
        else:
            tag_word[(tag,word1)]=1
            
            
    bigrams = ngrams(tags, 2)
    tag_tag.update(bigrams)
        
     




for key,value in tag_word.items():
    
    prob_tag_word[key]=tag_word[key]/tag_count[key[0]]
    
    
for (t1,t2) in tag_tag.keys():
    
    prob_tag_tag[(t1,t2)] = tag_tag[(t1,t2)] / tag_count[t1]
    
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk import FreqDist

sentence = 'this is a foo bar sentences and i want to ngramize it this this'
# n = 3
# list_grams = ngrams(sentence.split(), n)
#
# for grams in list_grams:
#     string = ''
#     for value in grams:
#         string = string + ' ' + value
#     print (string.strip())

fdist = FreqDist()
tokens = word_tokenize(str(sentence))
fdist.update(tokens)

for value in fdist.most_common():
    print(value)

i = 11
for i in range(0, 10):
    i = i + 2
    print('testing')

text = 'Mount Batten Rd Haig Rd Sims Ave'
split_text = text.split('Rd')
for value in split_text:
    print(value)
def token_bef(list_line, command):
    # check the token before label, note that belongs to the command ('svc', 'road', 'busstop')
    port = PorterStemmer()
    text = ''
    list_length = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')  # list of sentences
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')  # list of label for each word
        list_length.append(len(split_first))

        if command == 'svc':
            for k in range(0, len(split_second)):
                # check the frequency of token before bus service
                if int(split_second[k]) == 1:  # mean bus svc
                    if k > 0:  # bus svc doesn't appear at the first position of sentences
                        # try:  # don't use stemming here
                        #     stem_word = port.stem(connect_token(split_first[k - 1].lower()))  # take the token before
                        # except UnicodeDecodeError:
                        #     stem_word = connect_token(split_first[k - 1].lower())
                        stem_word = connect_token(split_first[k - 1].lower())
                        if is_int(stem_word) is False:
                            text = text + stem_word + ' '

        elif command == 'road':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 2:  # mean road
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before

                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 2:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

        elif command == 'busstop':
            k = 0
            while True:
                if k >= len(split_second):
                    break
                else:
                    try:
                        if int(split_second[k]) == 3:  # mean bus stop
                            if k > 0:
                                stem_word = connect_token(split_first[k - 1].lower())
                                if is_int(stem_word) is False:
                                    text = text + stem_word + ' '  # take the word before

                            while True:
                                k += 1
                                if k == len(split_second):
                                    break
                                else:
                                    if int(split_second[k]) != 3:
                                        break
                        else:
                            k += 1
                    except ValueError:
                        k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print(value[0], '\t', value[1])
        # print value[0]

    print(text)
Example #39
class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer

    def update(self, glove_dir, max_size=None, min_freq=1):
        """
        Initialize id2word & word2id based on self.freqdist
        max_size include 4 special tokens
        """

        # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.id2word = {
            PAD_ID: PAD_TOKEN, UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN, EOS_ID: EOS_TOKEN
        }
        # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.word2id = defaultdict(lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        })
        # self.word2id = {
        #     PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
        #     SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        # }

        vocab_size = 4
        min_freq = max(min_freq, 1)

        # Reset frequencies of special tokens
        # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)]
        freqdist = self.freqdist.copy()
        special_freqdist = {token: freqdist[token]
                            for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]}
        freqdist.subtract(special_freqdist)

        # Sort: by frequency, then alphabetically
        # Ex) freqdist = { 'a': 4,   'b': 5,   'c': 3 }
        #  =>   sorted = [('b', 5), ('a', 4), ('c', 3)]
        sorted_frequency_counter = sorted(freqdist.items(), key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        # Load glove vector
        word_emb_dict = self.get_glove_emb(glove_dir)

        for word, freq in sorted_frequency_counter:

            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size


        # Create embedding matrix
        self.embedding_matrix = embedding_matrix = np.zeros((self.vocab_size, 300))

        for word, ind in self.word2id.items():
            if word.lower() in word_emb_dict:
                embedding_matrix[self.word2id[word]] = word_emb_dict[word.lower()]
            else:
                embedding_matrix[self.word2id[word]] = np.random.uniform(-0.25, 0.25, 300)

    def get_glove_emb(self, GLOVE_DIR):
        embeddings_index = {}
        f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), 'rb')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word.decode().lower()] = coefs
        f.close()
        return embeddings_index


    def __len__(self):
        return len(self.id2word)


    def load(self, word2id_path=None, id2word_path=None, word_emb_path=None):
        if word2id_path:
            with open(word2id_path, 'rb') as f:
                word2id = pickle.load(f)
            # Can't pickle lambda function
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(id2word_path, 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word
        
        if word_emb_path:
            with open(word_emb_path, 'rb') as f:
                embedding_matrix = pickle.load(f)
            self.embedding_matrix = embedding_matrix

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path, word_emb_path):
        with open(word2id_path, 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(id2word_path, 'wb') as f:
            pickle.dump(self.id2word, f)

        with open(word_emb_path, 'wb') as f:
            pickle.dump(self.embedding_matrix, f)

    def to_list(self, list_like):
        """Convert list-like containers to list"""
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        """list of id => list of tokens (Single sentence)"""
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        """list of tokens => list of id (Single sentence)"""
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
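
# --- Usage sketch (not part of the original example) ---
# A minimal sketch of how the vocabulary above might be built and queried.
# It assumes the class is named Vocab (as in the later examples), that the
# constructor accepts a tokenizer, and that update() takes glove_dir,
# max_size and min_freq as suggested by its body; the path and sizes are
# placeholders.
vocab = Vocab(tokenizer=str.split)
vocab.add_sentence("hello world how are you")
vocab.add_sentence("hello again")
vocab.update(glove_dir='data/glove', max_size=20000, min_freq=1)

ids = vocab.sent2id(['hello', 'world'])   # list of integer ids
print(vocab.decode(ids))                  # 'hello world'
print(vocab.embedding_matrix.shape)       # (vocab_size, 300)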
Example #40
0
class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer
        self.pad_id = PAD_ID

    def update(self, max_size=None, min_freq=1):
        self.id2word = {
            PAD_ID: PAD_TOKEN,
            UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN,
            EOS_ID: EOS_TOKEN,
            SEP_ID: SEP_TOKEN,
        }
        self.word2id = defaultdict(
            lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID,
            UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID,
            EOS_TOKEN: EOS_ID,
            SEP_TOKEN: SEP_ID,
        })

        vocab_size = 5
        min_freq = max(min_freq, 1)

        freqdist = self.freqdist.copy()
        special_freqdist = {
            token: freqdist[token]
            for token in
            [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, SEP_TOKEN]
        }
        freqdist.subtract(special_freqdist)

        sorted_frequency_counter = sorted(freqdist.items(),
                                          key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        for word, freq in sorted_frequency_counter:
            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.id2word)

    def load(self, word2id_path=None, id2word_path=None, ptb=False):
        if word2id_path:
            with open(word2id_path, 'rb') as f:
                word2id = pickle.load(f)
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(id2word_path, 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word

        if ptb:
            self.word2id['<sep>'] = self.vocab_size
            self.id2word[self.vocab_size] = '<sep>'
            self.vocab_size += 1

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path):
        with open(word2id_path, 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(id2word_path, 'wb') as f:
            pickle.dump(self.id2word, f)

    def to_list(self, list_like):
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
Example #41
0
    in_str = sys.stdin.read(BUF_SIZE)
    rest = ''

    read_count = 0

    while (rest + in_str).strip() != '':
        read_count += 1

        if read_count % 100 == 0:
            sys.stderr.write('.')
            sys.stderr.flush()

        tokens = (rest + in_str).split()
        # keep the (possibly incomplete) last token for the next chunk
        rest = tokens.pop()

        if not tokens:
            vocab.update([rest])  # wrap in a list so characters are not counted
            break
        else:
            vocab.update(tokens)

        in_str = sys.stdin.read(BUF_SIZE)

    print()

    # FreqDist counts are not frequency-sorted in NLTK 3,
    # so sort them explicitly before reading off cutoffs.
    freqs = sorted(vocab.values(), reverse=True)

    for i in [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000]:
        if i >= len(freqs):
            break

        print("vocab size %7d - cutoff = %d" % (i, freqs[i]))
Example #42
0
class BiWordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()
        

    """
    Processes statuses. (For information on how the different data structures
    are set up, look at the comments for the getters.)
    """

    
    def wordprocess(self):
        row = 0
        for status in self._statuses[1:]:
            row += 1
            print(row)
            user = status[0]

            # strip punctuation (str.translate signature as of Python 3)
            filtered_status = status[1].translate(str.maketrans('', '', string.punctuation))

            tokens = pattern_split.split(filtered_status.lower())

            filtered_tokens = [w for w in tokens if w not in stopwordslist and w not in filterlist]

            bitokens = nltk.bigrams(filtered_tokens)

            if status[5] == '+':
                self.fdistneuro.update(bitokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(bitokens)

    def neuro_word_frequency(self):
        # FreqDist keys are not frequency-sorted in NLTK 3; use most_common()
        highvocneuro = [w for w, _ in self.fdistneuro.most_common(300)]
        return highvocneuro

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            if self.fdistneuro[w] >= 5:
                self.highneuro[w] = self.fdistneuro[w]

        print(self.highneuro.items())
        print(self.highneuro.keys())
        return self.highneuro.keys()
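
# --- Standalone sketch (not part of the original example) ---
# The core pattern above is FreqDist.update() fed with nltk.bigrams();
# here is a toy version on a hand-made token list, independent of the
# pickled status data used by the class:
import nltk
from nltk import FreqDist

tokens = ['i', 'really', 'like', 'green', 'tea', 'and', 'green', 'apples']
bigram_dist = FreqDist()
bigram_dist.update(nltk.bigrams(tokens))

print(bigram_dist[('green', 'tea')])   # 1
print(bigram_dist.most_common(3))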
Example #43
0
class PosNgram:
    def __init__(self, deg=1):
        self.order = deg
        self.__sentence = ""

        # storing tokens and frequency
        self.train_data = FreqDist()
        self.test_sents = None

        # guard against an illegal order argument
        if deg < 1:
            self.order = 1

    def poses2tokens(self, pos_terms, include_freq=False, default_dict=None):
        """
        pos_terms must be an element of an n-gram model
        whose order is 1 smaller than that of the current one.
        """
        if default_dict is None:
            default_dict = self.train_data

        for (tokens, poses), freq in default_dict.items():
            if pos_terms == poses:
                yield (tokens, freq) if include_freq else tokens

    def tokens2poses(self, token_terms, include_freq=False, default_dict=None):
        """
        token_terms must be an element of an n-gram model
        whose order is 1 smaller than that of the current one.
        """
        if default_dict is None:
            default_dict = self.train_data

        for (tokens, poses), freq in default_dict.items():
            if token_terms == tokens:
                yield (poses, freq) if include_freq else poses

    def pre_process(self, file_id, training_size=90):

        start_processing = time.time()
        self.train_data = FreqDist()

        sents = gutenberg.sents(file_id)
        t_size = floor((training_size / 100) * len(sents))

        train_sents = sents[:t_size]
        self.test_sents = sents[t_size:]

        p_title = "file_id = <{}>, ngram's order = {}, split_ratio = {}-{}"
        print(
            p_title.format(file_id, self.order, training_size,
                           100 - training_size))
        with ICB('Processing...', max=len(train_sents),
                 suffix='%(percent)d%%') as bar:

            for sent in train_sents:
                bar.next()
                self.__sentence = " ".join(sent)
                self.train_data.update(self._token_pos_pairs)

        print('dict_size = {}'.format(self.train_data.B()))
        print("loading time = {}".format(time.time() - start_processing))

    def _is_subcontent(self, w1, w2):
        assert len(w1) <= len(w2)
        w1 = list(w1)
        w2 = list(w2)
        for w in w1:
            if w not in w2:
                return False
            w2.remove(w)
        return True

    def fetch_if(self, cond, term, pos_is_target=True, include_pair=False):

        tmp_freq_dist = FreqDist()

        conditions = {
            ng_prefix: ["pos[:-1] == term", "token[:-1] == term"],
            ng_suffix:
            ["pos[-len(term):] == term", "token[-len(term):] == term"],
            ng_contain: [
                "self._is_subcontent(term, pos)",
                "self._is_subcontent(term , token)"
            ],
            ng_equal: ["pos == term", "token == term"]
        }

        if cond not in conditions:
            cond = ng_prefix  # default to prefix matching

        # Fetching Choice Configuration
        p_key, t_key = "", ""
        if include_pair:
            p_key = "(pos, token)"
            t_key = "(token, pos)"
        else:
            p_key = "pos"
            t_key = "token"
        cmp_p = compile(p_key, '<string>', 'eval')
        cmp_t = compile(t_key, '<string>', 'eval')

        if pos_is_target:
            cmp_cond = compile(conditions[cond][0], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_p): freq})
        else:
            cmp_cond = compile(conditions[cond][1], '<string>', 'eval')
            for (token, pos), freq in self.train_data.items():
                if eval(cmp_cond):
                    tmp_freq_dist.update({eval(cmp_t): freq})

        return tmp_freq_dist

    @property
    def _token_pos_pairs(self):
        """
        This function maps terms to POS
        (The previous version's name was phi1)
        """
        for elems in self._ngram_tokens_pos:
            poses = [elem[1] for elem in elems]

            tokens = [elem[0] for elem in elems]
            yield (tuple(tokens), tuple(poses))

    @property
    def _sent2pos_tag(self):
        sent = self.__sentence
        tokens = word_tokenize(sent)
        return pos_tag(tokens)

    @property
    def _ngram_tokens_pos(self):
        # this returns the tuples of token pos pair
        return ngrams(self._sent2pos_tag, self.order)
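
# --- Usage sketch (not part of the original example) ---
# A rough sketch of how the class above might be driven; the Gutenberg
# file id is just an illustrative choice, and pre_process() prints its
# own progress information.
model = PosNgram(deg=2)
model.pre_process('austen-emma.txt', training_size=90)

# five most frequent (token-bigram, POS-bigram) pairs seen in training
print(model.train_data.most_common(5))

# token bigrams whose POS pattern is exactly ('DT', 'NN')
for tokens in model.poses2tokens(('DT', 'NN')):
    print(tokens)
    break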
Example #44
0
def load_all_dic_token_bef_road_busstop(list_line, command):
    # Collect the tokens that appear immediately before a labelled span,
    # skipping numeric tokens: only tokens made up entirely of characters are kept.
    # Used only for "road" and "busstop".

    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break

            if command == 'road':  # get the token before labeling for road
                try:
                    if int(split_second[k]) == 2:  # detect this is a road => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before

                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 2:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

            if command == 'busstop':  # get the token before labeling for road
                try:
                    if int(split_second[k]) == 3:  # detect this is a road => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before

                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 3:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print value[0], '\t', value[1]

    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print value[0]
    print len(fdist)
    return list_return
Example #45
0
class WordExtractor:

    def __init__(self, pickle_file):
        self._statuses = pickle.load(open(pickle_file, 'rb'))
        self._averages = dict()
        self._gender_stats = dict()
        self.fdistneuro = FreqDist()
        self.fdistnonneuro = FreqDist()
        self.highneuro = defaultdict()
        self.highnonneuro = defaultdict()
        
        self.f = defaultdict(float)
        self.g = defaultdict(float)
        self.wordlist = []
        
        

    """
    Processes statuses. (For information on how the different data structures
    are set up, look at the comments for the getters.)
    """

    
    def wordprocess(self):
        line = 0
        for status in self._statuses[1:]:
            line += 1
            print(line)
            user = status[0]
            # strip punctuation (str.translate signature as of Python 3)
            filtered_status = status[1].translate(str.maketrans('', '', string.punctuation))

            tokens = pattern_split.split(filtered_status.lower())

            # filter out stopwords and emoticons
            filtered_tokens = [w for w in tokens if w not in stopwordslist and w not in filterlist]

            if status[5] == '+':
                self.fdistneuro.update(filtered_tokens)
            elif status[5] == '-':
                self.fdistnonneuro.update(filtered_tokens)

    # returns most frequently used words by neurotic persons
    def neuro_word_frequency(self):
        # FreqDist keys are not frequency-sorted in NLTK 3; use most_common()
        return [w for w, _ in self.fdistneuro.most_common(500)]

    # returns most frequently used words by non-neurotic persons
    def nonneuro_word_frequency(self):
        return [w for w, _ in self.fdistnonneuro.most_common(500)]

    def highneuro_word_frequency(self):
        for w in self.neuro_word_frequency():
            self.highneuro[w] = self.fdistneuro[w]
        return self.highneuro.items()

    def highnonneuro_word_frequency(self):
        for w in self.nonneuro_word_frequency():
            self.highnonneuro[w] = self.fdistnonneuro[w]
        return self.highnonneuro.items()

    def select_word(self):
        # corpus-size ratios; in Python 2 these integer divisions returned 0 and 9,
        # true division restores the intended ratios
        nntn = 184563 / 1780098
        ntnn = 1780098 / 184563

        for w in self.highneuro.keys():
            if w in self.highnonneuro.keys():
                self.f[w] = int(self.highneuro[w] - self.highnonneuro[w] * nntn)

        print(self.f.items())

        print("Start calculating non-neurotic words")
        for w in self.highnonneuro.keys():
            if w in self.highneuro.keys():
                self.g[w] = int(self.highnonneuro[w] - self.highneuro[w] * ntnn)
            else:
                print("False for %s" % w)

        print(self.g.items())

        for w in self.f.keys():
            if w in self.g.keys():
                if self.f[w] >= 2000 and self.g[w] <= 500:
                    self.wordlist.append(w)

        print("Here is the wordlist")
        print(self.wordlist)
        # return a list of words used relatively heavily by neurotic persons
        return self.wordlist
Example #46
0
    tokens = [re.sub(r'[^A-Za-z]+', '', token) for token in tokens]
    tokens = [wn.lemmatize(token) for token in tokens]
    return tokens


text_tokens=[]
for item in sentences[0:1000]:
    tokens = preprocess_text(item)
    temp = " ".join(tokens)
    text_tokens.append(temp)    

from nltk import FreqDist

word_dist = FreqDist()
for s in text_tokens:
    word_dist.update(s.split())

########################################################################################
from nltk.util import ngrams
from collections import Counter


# join the preprocessed sentences with spaces so tokens do not merge
# at sentence boundaries
text = ' '.join(text_tokens)

tokens = word_tokenize(text)
bigrams = ngrams(tokens, 2)

bigram_dict=dict(Counter(bigrams))
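
# --- Alternative sketch (not part of the original example) ---
# The same bigram counts can be accumulated with FreqDist.update() instead
# of collections.Counter; this assumes text_tokens from above, i.e. a list
# of preprocessed sentence strings.
from nltk import FreqDist
from nltk.tokenize import word_tokenize

bigram_dist = FreqDist()
for sent in text_tokens:
    # updating sentence by sentence keeps bigrams from spanning sentence boundaries
    bigram_dist.update(ngrams(word_tokenize(sent), 2))

print(bigram_dist.most_common(10))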
Example #47
0
def load_all_dic_token_bef_aft_svc(list_line, command):
    # Collect the tokens immediately before or after a bus-service label.
    # Used only for bus services, because here we care not only about the token
    # before the label but also about the token after it.
    text = ''
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        j = i + 1
        if j % 3 == 1:
            split_second = list_line[j].strip().split('\t')

        k = 0
        while True:
            if k >= len(split_second):
                break

            if command == 'bef_svc':  # get the token before labeling for bus svc
                try:
                    if int(split_second[k]) == 1:  # detect this is a svc => get the token before it
                        if k > 0:
                            token_bef = split_first[k - 1].lower()
                            if token_isAllCharacter(token_bef) is True:
                                text = text + connect_token(token_bef) + ' '  # take the word before

                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                    else:
                        k += 1
                except ValueError:
                    k += 1

            if command == 'aft_svc':
                try:
                    if int(split_second[k]) == 1:  # take bus svc
                        while True:
                            k += 1
                            if k == len(split_second):
                                break
                            else:
                                if int(split_second[k]) != 1:
                                    break
                        if k < len(split_second) - 1:
                            # take the token after the label
                            token_aft = split_first[k].lower()
                            if token_isAllCharacter(token_aft) is True:
                                text = text + connect_token(token_aft) + ' '
                    else:
                        k += 1

                except ValueError:
                    k += 1

    fdist = FreqDist()
    tokens = word_tokenize(str(text))
    fdist.update(tokens)
    for value in fdist.most_common(len(fdist)):
        print(value[0], '\t', value[1])

    list_return = list()
    for value in fdist.most_common(len(fdist)):
        list_return.append(value[0])
        print(value[0])
    print(len(fdist))
    return list_return
Example #48
0
class Vocab(object):
    def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer

    def update(self, max_size=None, min_freq=1):
        """
        Initialize id2word & word2id based on self.freqdist
        max_size include 4 special tokens
        """

        # {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
        self.id2word = {
            PAD_ID: PAD_TOKEN,
            UNK_ID: UNK_TOKEN,
            SOS_ID: SOS_TOKEN,
            EOS_ID: EOS_TOKEN
        }
        # {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
        self.word2id = defaultdict(
            lambda: UNK_ID)  # Not in vocab => return UNK
        self.word2id.update({
            PAD_TOKEN: PAD_ID,
            UNK_TOKEN: UNK_ID,
            SOS_TOKEN: SOS_ID,
            EOS_TOKEN: EOS_ID
        })
        # self.word2id = {
        #     PAD_TOKEN: PAD_ID, UNK_TOKEN: UNK_ID,
        #     SOS_TOKEN: SOS_ID, EOS_TOKEN: EOS_ID
        # }

        vocab_size = 4
        min_freq = max(min_freq, 1)

        # Reset frequencies of special tokens
        # [...('<eos>', 0), ('<pad>', 0), ('<sos>', 0), ('<unk>', 0)]
        freqdist = self.freqdist.copy()
        special_freqdist = {
            token: freqdist[token]
            for token in [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]
        }
        freqdist.subtract(special_freqdist)

        # Sort: by frequency, then alphabetically
        # Ex) freqdist = { 'a': 4,   'b': 5,   'c': 3 }
        #  =>   sorted = [('b', 5), ('a', 4), ('c', 3)]
        sorted_frequency_counter = sorted(freqdist.items(),
                                          key=lambda k_v: k_v[0])
        sorted_frequency_counter.sort(key=lambda k_v: k_v[1], reverse=True)

        for word, freq in sorted_frequency_counter:

            if freq < min_freq or vocab_size == max_size:
                break
            self.id2word[vocab_size] = word
            self.word2id[word] = vocab_size
            vocab_size += 1

        self.vocab_size = vocab_size

    def __len__(self):
        return len(self.id2word)

    def load(self, word2id_path=None, id2word_path=None):
        if word2id_path:
            with open(str(word2id_path), 'rb') as f:
                word2id = pickle.load(f)
            # Can't pickle lambda function
            self.word2id = defaultdict(lambda: UNK_ID)
            self.word2id.update(word2id)
            self.vocab_size = len(self.word2id)

        if id2word_path:
            with open(str(id2word_path), 'rb') as f:
                id2word = pickle.load(f)
            self.id2word = id2word

    def add_word(self, word):
        assert isinstance(word, str), 'Input should be str'
        self.freqdist.update([word])

    def add_sentence(self, sentence, tokenized=False):
        if not tokenized:
            sentence = self.tokenizer(sentence)
        for word in sentence:
            self.add_word(word)

    def add_dataframe(self, conversation_df, tokenized=True):
        for conversation in conversation_df:
            for sentence in conversation:
                self.add_sentence(sentence, tokenized=tokenized)

    def pickle(self, word2id_path, id2word_path):
        with open(str(word2id_path), 'wb') as f:
            pickle.dump(dict(self.word2id), f)

        with open(str(id2word_path), 'wb') as f:
            pickle.dump(self.id2word, f)

    def to_list(self, list_like):
        """Convert list-like containers to list"""
        if isinstance(list_like, list):
            return list_like

        if isinstance(list_like, Variable):
            return list(to_tensor(list_like).numpy())
        elif isinstance(list_like, Tensor):
            return list(list_like.numpy())

    def id2sent(self, id_list):
        """list of id => list of tokens (Single sentence)"""
        id_list = self.to_list(id_list)
        sentence = []
        for id in id_list:
            word = self.id2word[id]
            if word not in [EOS_TOKEN, SOS_TOKEN, PAD_TOKEN]:
                sentence.append(word)
            if word == EOS_TOKEN:
                break
        return sentence

    def sent2id(self, sentence, var=False):
        """list of tokens => list of id (Single sentence)"""
        id_list = [self.word2id[word] for word in sentence]
        if var:
            id_list = to_var(torch.LongTensor(id_list), eval=True)
        return id_list

    def decode(self, id_list):
        sentence = self.id2sent(id_list)
        return ' '.join(sentence)
Example #49
0
    if word_limit:
        logging.info('Word limit %d' % word_limit)

    order = parse_ngram_order(opts.ngram_order)

    logging.info('Char n-gram order (%d, %d)' % order)
    cutoff = opts.min_count

    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    tf = FreqDist()
    df = FreqDist()

    n_docs = 0

    for text in corpus:
        n_docs += 1

        tf.update(text)
        df.update(set(text))

    print "###TOTAL###\t%d\t%d" % (tf.N(), n_docs)

    for token, freq in tf.items():
        if freq < cutoff:
            break

        print "%s\t%d\t%d\t%.6f" % (token, freq, df[token], math.log(float(n_docs)/df[token]))