def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i%100==0: print '    dict',str(i)+'/'+str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs)/doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % feature_type);
        dicts.append(d)
    return dicts
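
FreqDist.inc() and FreqDist.samples() above come from the old NLTK 2 API. Below is a minimal sketch of the same TF / TF-IDF dictionary construction against the NLTK 3 FreqDist (a collections.Counter subclass), assuming a plain whitespace tokenizer in place of preprocess.preprocess_text:

import math
from nltk import FreqDist

def tf_idf_dicts(docs, use_idf=True):
    doc_freqs = FreqDist()           # number of documents each word appears in
    tf_dists = []
    for doc in docs:
        fd = FreqDist(doc.lower().split())
        doc_freqs.update(fd.keys())  # count each word once per document
        tf_dists.append(fd)

    num_docs = len(docs)
    dicts = []
    for fd in tf_dists:
        if use_idf:
            d = {w: fd.freq(w) * math.log(num_docs / doc_freqs[w]) for w in fd}
        else:
            d = {w: fd.freq(w) for w in fd}
        dicts.append(d)
    return dicts

# e.g. tf_idf_dicts(["the cat sat", "the dog sat"])
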
Example #2
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print '    dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(
                    float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % feature_type)
        dicts.append(d)
    return dicts
def wrd_ngram_stats(texts, corpus, order, include_lower=False):
    
    all_wd_ngrams = FreqDist()
    text_wrd_ngrams = []
    
    for text in texts:
        
        if not text.endswith(".txt"):
            continue
        
        wrd_tokens = corpus.words(text)
        empty = len(corpus.raw(text)) == 0
        
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())

        if not empty:
            lower_wrds = [w.lower() for w in wrd_tokens if w.isalnum()]
            
            if include_lower:
                for n in range(1, order+1):
                    wd_ng = ngrams(lower_wrds, n)
                    text_ngrams[n-1].update(wd_ng)
                    if n == order:
                        all_wd_ngrams.update(wd_ng)
            else:
                wd_ng = ngrams(lower_wrds, order)
                text_ngrams[order-1].update(wd_ng)
                all_wd_ngrams.update(wd_ng)
                
        text_wrd_ngrams.append(text_ngrams)
            
    return all_wd_ngrams, text_wrd_ngrams
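
The core counting pattern in wrd_ngram_stats is nltk.util.ngrams feeding FreqDist.update; a self-contained toy sketch of that step, with no corpus object involved:

from nltk import FreqDist
from nltk.util import ngrams

tokens = "the quick brown fox jumps over the lazy dog".split()
bigram_counts = FreqDist(ngrams(tokens, 2))   # count word bigrams
print(bigram_counts.most_common(3))
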
Example #4
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist
 
    logging.info("\tEvaluating HTML")
     
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [ x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)

        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
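
The title/h1 handling above parses each matched tag and repeats fdist.update to weight its terms. A tiny illustration of the parsing step, assuming etree here is lxml.etree (xml.etree.ElementTree exposes the same fromstring/.text calls):

from lxml import etree

root = etree.fromstring("<title>Cheap Hotels In Rome</title>")
print(root.text)   # -> 'Cheap Hotels In Rome'
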
Example #5
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if (pos != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
 
    for review in negids:
        neg += 1
        if (neg != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
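
high_words scores each word with BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx). A toy, self-contained sketch of that scoring with hand-built counts (the review objects and the token_helpers module above are external to this sketch):

from operator import itemgetter
from nltk import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

word_fd = FreqDist('great great awful plot plot plot'.split())
label_word_fd = ConditionalFreqDist()
label_word_fd['pos'].update('great great plot'.split())
label_word_fd['neg'].update('awful plot plot'.split())

pos_n = label_word_fd['pos'].N()
neg_n = label_word_fd['neg'].N()
total = pos_n + neg_n

scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_n), total)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_n), total)
    scores[word] = pos_score + neg_score

print(sorted(scores.items(), key=itemgetter(1), reverse=True))
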
    
    """
Example #7
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % feature_type);
        matrix[:,i] = v

    return matrix
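
The matrix returned by text_to_vector is features x documents, so downstream similarity measures can be computed on it directly; for example, pairwise cosine similarity between documents with plain numpy:

import numpy as np

def doc_cosine_similarities(matrix):
    # matrix has shape (num_features, num_docs); L2-normalize each column.
    norms = np.linalg.norm(matrix, axis=0, keepdims=True)
    normalized = matrix / np.where(norms == 0, 1, norms)
    return normalized.T @ normalized   # (num_docs, num_docs) similarities
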
Example #9
def tokenize_data(sentences):
    # tokenize the dataset
    fdist = FreqDist()
    tokenized_sents = []
    for sentence in sentences:
        tokenized_sent = [w.lower() for w in word_tokenize(sentence)]
        tokenized_sents.append(tokenized_sent)
        fdist.update(tokenized_sent)

    # print("Number of word types in the tokenized data: ", len(fdist))
    return tokenized_sents
Example #10
def generate_freq_dist(samples):
    fdist = FreqDist()
    lemmatizer = WordNetLemmatizer()

    for sample in samples:
        temp = FreqDist([
            lemmatizer.lemmatize(word, "v")
            for sent in sent_tokenize(sample.text)
            for word in word_tokenize(sent)
        ])
        fdist.update(temp)
    return fdist
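
The lemmatizer call above passes "v", so every token is lemmatized as a verb; a quick illustration (requires the WordNet data, e.g. nltk.download('wordnet')):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running", "v"))   # -> 'run'
print(lemmatizer.lemmatize("studies", "v"))   # -> 'study'
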
Example #11
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])

    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))

    return words, bigrams
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])

    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))
    
    return words, bigrams
Example #13
def sentence_ngrams(sentence):
    print(str('pid:{} ||'.format(os.getpid())),
          time.strftime("%y-%m-%d_%H:%M:%S"))
    sentence = sentence.strip()
    if not sentence:
        raise ValueError('Empty sentence!')

    words = sentence.split(' ')
    ngrams_bag = FreqDist()
    for i in range(4):
        ngrams_bag.update(ngrams(words, i + 1))

    return ngrams_bag
Example #14
def cnc(phrase_lists,
        c_value_threshold=0,
        include_unigrams=False,
        weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary of word, c-value (ranking) pairs"""
    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))

    # word -> C-value(word)
    phrase_scores = {}

    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()

    # word -> (t(word), c(word))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(), \
                                         key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1
    return phrase_scores, phrase_frequencies
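
For reference, the score this function computes is the C-value of Frantzi and Ananiadou's C-value/NC-value method: C-value(a) = log2(|a|) * ( f(a) - (1/|T_a|) * sum over b in T_a of f(b) ), where |a| is the phrase length, f(a) its frequency, and T_a the set of longer candidate phrases containing a. The subtractive variable above is the averaged nested-frequency term, and weight is the log2 factor (computed on |a|+1 when unigrams are included, so single words do not get a zero weight).
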
Example #15
async def gobaby(urls):
    """
    Prepare and launch asynchronous requests for the URLs and build the final word-frequency distribution.
    :param urls:
    :return:
    """
    futures = [get_content(url) for url in
               urls]  # create the list of futures (coroutines to run asynchronously)
    done, _ = await asyncio.wait(futures)  # run the futures
    result_dict = FreqDist()  # frequency distribution that collects the results
    for future in done:  # for each completed future
        try:
            result_dict.update(give_me_my_dict(future.result()))
        except Exception as e:
            print('Error:', e)
    create_cloud(result_dict)  # build the word cloud
def analyze(data, out_dir):
    summary = {}
    freq = FreqDist()
    sentence_length = defaultdict(list)
    year_freq_dist = defaultdict(FreqDist)
    year_dist = defaultdict(int)
    year_month_dist = defaultdict(int)
    year_quarter_dist = defaultdict(int)

    has_date = no_date = sentences = words = 0

    for year, date_str, title, text in data:
        date = parsedate(date_str)
        logger.debug('%s -> %s' % (date_str, str(date)))
        freq.update(ngram_phrases(text,3))
        if date:
            # Since can't use strftime for years before 1900, we need to use isoformat
            year_str = date.isoformat()[:4]
            year_mo_str = date.isoformat()[:7]
            has_date += 1
        else:
            no_date += 1
            year_str = ''
            year_mo_str = ''

        if year_str:
            year_range = get_year_range(year_str)
            sentence_length[ year_range ].extend( sentence_lengths(text) )
            year_freq_dist[ year_range ].update( ngram_phrases(text,3) )
            year_dist[year] += 1

        if year_mo_str:
            year_month_dist[year_mo_str] += 1
            year_quarter_dist[ year_quarter(year_mo_str) ] += 1

        sentences += count_sentences(text)
        words += count_words(text)

    logger.debug('Documents with a valid date: %d Documents without a valid date: %d' % (has_date, no_date))
    logger.debug('Total # Sentences: %d' % sentences)
    logger.debug('Total # Words: %d' % words)

    generate_dict_csv(['year', 'cnt'], year_dist, os.path.join(out_dir, 'year-data.csv'))
    generate_dict_csv(['yearmo', 'cnt'], year_month_dist, os.path.join(out_dir, 'year-mo-data.csv'))
    generate_dict_csv(['yearq', 'cnt'], year_quarter_dist, os.path.join(out_dir, 'year-quarter-data.csv'))
    generate_stream_js(year_freq_dist, os.path.join(out_dir, 'stream-data.json'))
    generate_cloud_csv(year_freq_dist, os.path.join(out_dir, 'year-phrase-data.csv'))
    generate_sentence_length_csv(sentence_length, os.path.join(out_dir, 'data-sentence-lengths.csv'))
Example #17
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary of word, c-value (ranking) pairs"""
    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))

    # word -> C-value(word)
    phrase_scores = {}

    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()

    # word -> (t(word), c(word))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(), key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1
    return phrase_scores, phrase_frequencies
Example #18
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()
    tmpDict = FreqDist()

    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))

    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except:
        newDict = buildCategoryDictionary(category)
        return newDict

    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
Example #19
def generate_lookup(ngrams: List[List[str]]):
    fdist = FreqDist()

    for entry in ngrams:
        fdist.update(list(entry))

    lookup = {}
    for ngram in fdist:        
        key =  ngram[:-1]
        word = ngram[-1]
        count = fdist[ngram]

        if key not in lookup:
            lookup[key] = {}
        
        lookup[key][word] = count
    return lookup
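
A hypothetical usage sketch for generate_lookup, assuming each entry passed in is a list of n-gram tuples (e.g. one list per sentence), so that the FreqDist keys are tuples and the lookup maps an (n-1)-token prefix to a {next_word: count} dict:

from nltk.util import ngrams

sentences = [["the", "cat", "sat"], ["the", "cat", "ran"]]
per_sentence_bigrams = [list(ngrams(s, 2)) for s in sentences]
lookup = generate_lookup(per_sentence_bigrams)
# lookup[("the",)] == {"cat": 2}; lookup[("cat",)] == {"sat": 1, "ran": 1}
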
Example #20
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()
    tmpDict = FreqDist()

    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))

    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except:
        newDict = buildCategoryDictionary(category)
        return newDict

    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
Example #21
def standard_log_key(log_key_sequence_str):
    # Split the log keys into sequences with a sliding window; here each window covers 4 log keys
    tokens = log_key_sequence_str.split(' ')
    # Convert the log keys to ints
    tokens = [int(i) for i in tokens]
    K = max(tokens)+1  # number of distinct log key types
    # print("the tokens are:",tokens)
    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)
    # Example: ngrams(['1', '2', '3', '4', '5'], 2) yields
    # ('1', '2'), ('2', '3'), ('3', '4'), ('4', '5')
    bigramfdist_4.update(bigrams_4)
    print("the bigramfdsit_4 is:", list(bigramfdist_4.keys()))
    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))

    # print("the seq is:",seq)
    X, Y = seq[:, :3], seq[:, 3:4]
    # print(seq.shape)   # (253, 4)
    # print(X_normal.shape)  # (253, 3)
    # print(Y_normal.shape)  # (253, 1)
    X = np.reshape(X, (-1, 3, 1))
    # print(X_normal)
    # [[[6]
    #   [72]
    #   [6]]
    #
    #  [[72]
    #     [6]
    #     [6]]
    #  ...]
    # Scale the values down proportionally into the range 0 to 1
    X = X / K
    # Convert the integer labels to one-hot vectors
    num_classes = len(list(set(Y.T.tolist()[0]))) + 1 # num_classes is the number of distinct label values in Y
    Y = keras.utils.to_categorical(Y)   # num_classes=num_classes
    return X, Y
def char_ngram_stats(texts, corpus, order, include_lower=False):
    '''
    Find character n-grams in some texts.
    @param texts: List of texts
    @param corpus: The corpus that holds the texts
    @param order: The order of the n-grams to consider.
    @param include_lower: Whether to include list of lower-order n-grams in output
    @return: A tuple: First element is a list of all n-grams (only of
    given order) across all texts. Second element is a matrix with a list of
    lists of 1-grams, 2-grams, ..., n-grams per text.
    ''' 
    
    all_char_ngrams = FreqDist()
    text_char_ngrams = [] # Char n-grams found in each text
    
    for text in texts:
        
        if not text.endswith(".txt"):
            continue
        
        empty = len(corpus.raw(text)) == 0 
    
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())
            
        if not empty:
            text_str = corpus.raw(text).replace('\r','').replace('\n', ' ')
            
            if include_lower:
                for n in range(1, order+1):
                    char_ng = ngrams(text_str, n)
                    text_ngrams[n-1].update(char_ng)
                    if n == order:
                        all_char_ngrams.update(char_ng)
            else:
                char_ng = ngrams(text_str, order)
                text_ngrams[order-1].update(char_ng)
                all_char_ngrams.update(char_ng)
            
        text_char_ngrams.append(text_ngrams)
            
    return all_char_ngrams, text_char_ngrams
def get_train(log_key_sequence_str):
    #     # we have the sequence of log keys
    #     seq = np.array(log_key_sequence)
    # divide the log sequence into 4 for every unit
    tokens = log_key_sequence_str.split(' ')
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('E', '')
        tokens[i] = int(tokens[i])
    #     print("the tokens are:",tokens)
    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)

    bigramfdist_4.update(bigrams_4)
    # print("the bigramfdsit_4 is:",bigramfdist_4.keys())
    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))
    # print("the seq is:",seq)
    X, Y = seq[:, :3], seq[:, 3:4]

    return X, Y
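
Both log-key helpers above rely on the same windowing: the key stream is cut into overlapping 4-grams, and each 4-gram is split into a 3-step history X and the following key Y. A minimal illustration:

import numpy as np
from nltk import FreqDist
from nltk.util import ngrams

keys = [5, 7, 5, 9, 7]
windows = FreqDist(ngrams(keys, 4))       # distinct 4-key windows
seq = np.array(list(windows.keys()))      # one row per distinct window
X, Y = seq[:, :3], seq[:, 3:4]            # 3-key history -> next key
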
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])
    
    words = FreqDist(t1[1])
    words.update(t2[1])
    
    places = FreqDist(t1[2])
    places.update(t2[2])
    
    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])
    
    return tags, words, places, bigrams
Example #25
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])

    words = FreqDist(t1[1])
    words.update(t2[1])

    places = FreqDist(t1[2])
    places.update(t2[2])

    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])

    return tags, words, places, bigrams
Example #26
def word_count(drug=None,limit=None,pos_filter=False,lemma=True):
	"""Scans comment texts (from drug_mentions.texts) for selected drug, 
	calculates most common words.

	KWARGS:
		drug: string or None.
			Drug selector.  Allows three cases:
			* None: scrape all comments in database, regardless of drug.
			* 'antidepressant': select comments speaking generically about
				drug, not referencing specific drug.
			* [drug name]: comments referencing specific drug.
			Default None.  Passed to drug_mentions.texts.
		limit: int or None.
			Optional limit on SQL queries retrieved by drug_mentions.texts. 
			Defaults to None (returns all hits).
		pos_filter: boolean.
			Passed to tokenize(), set True to use part-of-speech filtering.
		lemma: boolean.
			Passed to tokenize(), set True to use lemmatization.

	RETURNS:
		freq: nltk.probability.FreqDist object.
			Frequency distribution of words from comments.

	RAISES:
		ValueError:
			for invalid drug name.
	"""
	try:
		texts = dm.texts(drug=drug,limit=limit)
	except ValueError:
		raise ValueError('Invalid drug name.')

	freq = FreqDist()
	for text in texts:
		freq.update(tokenize(text,drug,pos_filter=pos_filter,lemma=lemma))

	return freq
def buildGoogleUnigram( ):
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"

    unigramFiles = os.listdir( DirPrefix )

    unigramFiles = list( map( lambda _fileName: DirPrefix + _fileName, unigramFiles ) )

    masterUnigram = FreqDist( )

    with multiprocessing.Pool( 8, initializer = initProcess ) as ProcessPool:
        resAsync = ProcessPool.map_async( _buildUnigram, unigramFiles )
        results = resAsync.get( )

    ProcessPool.join( )

    print( "all jobs finished, building master unigram" )
    for freqdist in results:
        masterUnigram.update( freqdist )

    with open( "PickledData/GoogleUnigram.pickle", 'wb' ) as pklFile:
        pickle.dump( masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL )

    return
Example #28
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [
                fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                for word in all_tokens
            ]
        else:
            raise ValueError("No such feature type: %s" % feature_type)
        matrix[:, i] = v

    return matrix
def buildGoogleUnigram():
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"

    unigramFiles = os.listdir(DirPrefix)

    unigramFiles = list(
        map(lambda _fileName: DirPrefix + _fileName, unigramFiles))

    masterUnigram = FreqDist()

    with multiprocessing.Pool(8, initializer=initProcess) as ProcessPool:
        resAsync = ProcessPool.map_async(_buildUnigram, unigramFiles)
        results = resAsync.get()

    ProcessPool.join()

    print("all jobs finished, building master unigram")
    for freqdist in results:
        masterUnigram.update(freqdist)

    with open("PickledData/GoogleUnigram.pickle", 'wb') as pklFile:
        pickle.dump(masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL)

    return
Example #30
class AddAlphaBigramModel():
    def __init__(self, alpha=0.1):
        self.vocabulary=set()
        self.V = 0
        self.bigrams=ConditionalFreqDist([])
        self.unigrams=FreqDist([])
        self.alpha = alpha
    def train(self):
        self.vocabulary=set()
        
        this_bigrams=[]
        self.unigrams = FreqDist([])
        
        for fileid in gutenberg.fileids():
            for sentence in gutenberg.sents(fileid):
                words=["<s>",] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>",]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)
        self.bigrams=ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)
        
    def bigram_prob(self, w1, w2):
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        retval= math.log(numerator / denominator)

        return retval

    def unigram_prob(self, w):
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator/denominator)
    
    def __contains__(self, w):
        return w in self.vocabulary
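
In other words, the class assigns the add-alpha smoothed estimates P(w2 | w1) = (c(w1, w2) + alpha) / (c(w1) + alpha * V) and P(w) = (c(w) + alpha) / (N + alpha * V), with V the vocabulary size; bigram_prob and unigram_prob return the natural log of these values.
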
Example #31
 def _create_vocabulary(self):
     """Analyze all the text sentences in the data set and create a vocabulary:
     1. The dataset vocabulary
     2. Number of words in the vocabulary
     3. Length of the longest sentence """
     frequencies = FreqDist()
     max_sentence_length = 0
     for idx in range(self.__len__()):
         txt_path = os.path.join(self.text_dir_path,
                                 self.images_df.iloc[idx].path + ".txt")
         with open(txt_path, "r") as f:
             for line in f:
                 tokens = [
                     token.lower()
                     for token in self.tokenizer.tokenize(line)
                 ]
                 if len(tokens) > max_sentence_length:
                     max_sentence_length = len(tokens)
                 frequencies.update(tokens)
     # Finally, create the vocabulary object from the torchtext library.
     vocabulary = Vocab(frequencies,
                        min_freq=2,
                        specials=["<unk>", "<eos>"])
     return vocabulary, len(vocabulary.itos), max_sentence_length
Example #32
def process_documents(path, html_conf):
    logging.info("Using documents from \"" + path + "\" directory ")
    
    if path[-1] != "/" :
        path + "/"

    documents = {}
    allterms  = {}
    listing   = os.listdir(path)
    allfreq   = FreqDist()

    # retriving document content - discarding structure
    logging.info("Processing files...")
    for infile in listing:
        logging.info("\tReading document " + infile)
        raw_doc     = open(path + infile, 'r').read()
        nonhtml_doc = nltk.clean_html(raw_doc)
        word_list   = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', raw_doc))
        terms_list  = [ x.lower() for x in word_list if x.lower() not in stopwords.words('english')]

        stemmes = steming(terms_list)

        for stem in stemmes :
            allterms[stem] = 0

        fdist = FreqDist(word.lower() for word in stemmes)
        allfreq.update(word.lower() for word in stemmes)

        htmldist = evaluate_html(raw_doc.lower(), html_conf)
        fdist.update(htmldist)
        allfreq.update(htmldist)
    
        documents[infile] = { 'docname': infile,  'terms': stemmes, 'tf': fdist, 'tfidf': None  }

    for key, doc in documents.iteritems():
        doctfidf = compute_tfidf(doc ,documents)
        documents[key]['tfidf'] = dict(allterms.items() + doctfidf.items())

    return documents, allfreq
f.close()
banset = set(stoplist)

count = 0
for hotel in wordlists.fileids():
 print hotel
 list1 = wordlists.words(hotel)
 list2 = []
 for w in list1:
  list2.append(w)
 list3 = [w.strip() for w in list2]

 if(count==0):
  fdict = FreqDist(list3)
 else:
  fdict.update(list3)

 count+=1
 print len(fdict)

fdict2=fdict.copy()

for w in fdict.keys()[:]:
 if w.strip() in banset or len(w.strip()) < 3 or len(w.strip()) > 25:
  del fdict2[w]
 elif isinstance(w, unicode):
  del fdict2[w]

for w in fdict2.keys():
 if len(w) < 3:
  print w, len(w)
                                         testsets['neutral'])
    classifier.show_most_informative_features()


def word_feats(words):
    return dict([(word, True) for word in words])


print 'evaluating single word features'
evaluate_classifier(word_feats)

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in tweets.words(categories=['pos']):
    word_fd.update([word.lower()])
    label_word_fd['pos'].update([word.lower()])

for word in tweets.words(categories=['neg']):
    word_fd.update([word.lower()])
    label_word_fd['neg'].update([word.lower()])

for word in tweets.words(categories=['neutral']):
    word_fd.update([word.lower()])
    label_word_fd['neutral'].update([word.lower()])

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
Example #35
    # featureBigramNeg.append(helperFuntions.bigramReturner(xNew))
    featureUnigramNeg.append(helperFuntions.getFeatureVector(xNew))
    # break

for x in dataPosTrain:
    # print x
    xNew = helperFuntions.removePunctuation(x)
    xNew = helperFuntions.toLower(xNew)
    xNew = helperFuntions.removeNumbers(xNew)
    xNew = helperFuntions.removeStopWords(xNew)
    # featureBigramPos.append(helperFuntions.bigramReturner(xNew))
    featureUnigramPos.append(helperFuntions.getFeatureVector(xNew))
    # break

for word in featureUnigramPos:
    word_fd.update(word)
    label_word_fd['pos'].update(word)

for word in featureUnigramNeg:
    word_fd.update(word)
    label_word_fd['neg'].update(word)

# print featureBigramPos
# print featureUnigramPos
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.iteritems():
Example #36
class EditDistanceFinder():  
    def __init__(self):
        self.char_probs = ConditionalProbDist([],MLEProbDist)
        self.bichar_freqs = ConditionalFreqDist([])
        self.transp_freqs = FreqDist()
        self.DOWN,self.LEFT,self.DIAG,self.DOUBLE_DIAG = range(4)
        self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4)
        
    def train(self, fname):
        misspellings=[]
        for line in open(fname):
            line=line.strip()
            if not(line): continue
            w1, w2 = line.split(",")
            misspellings.append((w1.strip(),w2.strip()))
       
        last_alignments = None
        done = False
        while not done:
            print("Iteration")
            alignments, bigrams = self.train_alignments(misspellings)
            self.train_costs(alignments, bigrams)
            done = (alignments == last_alignments)
            last_alignments = alignments
            
    def train_alignments(self, misspellings):
        alignments = []
        self.bichar_freqs = FreqDist()

        for error, corrected in misspellings:
            distance, this_alignments = self.align(corrected, error)
            alignments += this_alignments
            bigrams = [corrected[i:i+2] for i in range(len(corrected)-1)]
            self.bichar_freqs.update(bigrams)
            
        return alignments,bigrams
    
    def train_costs(self, alignments,bigrams):
        add_one_aligns = [(a,b) for a in string.ascii_lowercase for b in string.ascii_lowercase]
        single_aligns = [(a,b) for a,b in alignments if len(a) < 2]
        
        char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns)
        self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist)
        
        double_aligns = [a for a,b in alignments if len(a) >= 2]
        self.transp_freqs = FreqDist(double_aligns)

    def align(self, w1, w2, verbose=False):
        M = len(w1) +1
        N = len(w2) +1
        table = numpy.zeros((M,N))
        backtrace = numpy.zeros((M,N))
    
        for i in range(1,M):
            w1_char = w1[i-1]
            table[i,0] = table[i-1,0] + self.del_cost(w1_char)
            backtrace[i,0] = self.DOWN
        for j in range(1,N):
            w2_char = w2[j-1]
            backtrace[0,j] = self.LEFT
            table[0,j] = table[0,j-1] + self.ins_cost(w2_char)   
    
        for i in range(1,M):
            w1_char = w1[i-1]
            for j in range(1,N):
                w2_char = w2[j-1]

                this_del = table[i-1,j] + self.del_cost(w1_char)
                this_ins = table[i,j-1] + self.ins_cost(w2_char)
                this_sub = table[i-1,j-1] + self.sub_cost(w1_char,w2_char)
                
                if j > 1 and i > 1 and w1[i-1] == w2[j-2] and w1[i-2]==w2[j-1] and w1[i-1] != w1[i-2]:
                    this_transp = table[i-2,j-2] + self.transp_cost(w1_char, w2_char)
                else:
                    this_transp = 999999
            
                min_cost = min(this_del, this_ins, this_sub, this_transp)
                table[i,j] = min_cost

                if this_sub == min_cost:
                    backtrace[i,j] = self.DIAG
                elif this_transp == min_cost:
                    backtrace[i,j] = self.DOUBLE_DIAG
                elif this_ins == min_cost:
                    backtrace[i,j] = self.LEFT
                else: # delete
                    backtrace[i,j] = self.DOWN

                
        alignments = []
        i = M - 1    
        j = N - 1
        while (j or i):
            this_backtrace = backtrace[i,j]
            if this_backtrace == self.DIAG: # sub
                alignments.append((w1[i-1],w2[j-1]))
                i -= 1
                j -= 1
            elif this_backtrace == self.DOUBLE_DIAG:
                alignments.append((w1[i-2:i],w2[j-2:j]))
                i -= 2
                j -= 2
            elif this_backtrace == self.DOWN: # delete
                alignments.append((w1[i-1],"%"))
                i -= 1
            elif this_backtrace == self.LEFT: # insert
                alignments.append(("%",w2[j-1]))
                j -= 1

        alignments.reverse()
        if verbose:
            print(table)
        return table[M-1,N-1], alignments

    def transp_cost(self, char1, char2):
        ## how often do char1 and char2 get transposed?
        return 1 - self.transp_prob(char1,char2)
   
    def del_cost(self, char):
        return 1-self.char_probs[char].prob('%')
    def ins_cost(self, char):
        return 1-self.char_probs['%'].prob(char)
    def sub_cost(self, char1, char2):
        return 1-self.char_probs[char1].prob(char2)
    
    def transp_prob(self, char1, char2):
        numerator = self.transp_freqs[char1] + .1
        denominator = self.bichar_freqs[char1] + .1*26*26
        return numerator / denominator
    
    def prob(self, w1, w2):
        score, alignment = self.align(w1, w2)
        total_prob = 0
        for a, b in alignment:
            if len(a) > 1:
                total_prob += log(self.transp_prob(a[0],a[1]))
            else:
                total_prob += self.char_probs[a].logprob(b)
        return total_prob
    
    def show_alignment(self, alignments):
        print("String1:", " ".join([x[0] for x in alignments]))
        print("String2:", " ".join([x[1] for x in alignments]))
fin.close()


count=0
for hotel in wordlists.fileids():
 list4 = []
 print hotel
 taglist = tagger.tag(wordlists.words(hotel))
 list1 = find_chunk('CHUNK: {<JJ.*> <RB>* <NN.*>+}')
 list2 = find_chunk2('CHUNK: {<NN.*>+ <VB.*> <RB>* <JJ.*>}')
 list3 = find_chunk3('CHUNK: {<VB.*> <RB>* <JJ.*> <NN.*>}')
 list4 = list1 + list2 + list3
 if(count==0):
  fdict = FreqDist(list4)
 else:
  fdict.update(list4)
 count+=1
 print 'Size of dictionary:',len(fdict)
 print ''

f=open('stoplist.txt', 'r')
stoplist=[]
ban='IV'
while(ban!=''):
 ban=f.readline()
 stoplist.append(ban.strip())

f.close()
banset = set(stoplist)

fdict2=fdict.copy()
Example #38
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: C{int}
        :param train: the training text
        :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} 
        :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
        :type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with </s>
        :type pad_right: bool
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        #  wrong answer because '</s>' will be treated as unseen...
        self._rpad = ('</s>',) if pad_right else ()
        self._padLen = len(self._lpad)+len(self._rpad)

        self._N=0
        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert (estimator_args is ()) and (estimator_kwargs=={}),\
                   "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # Given backoff, a generator isn't acceptable
        if not isinstance(train,collections.abc.Sequence):
          train=list(train)
        self._W = len(train)
        # Coerce to list of list -- note that this means to train charGrams,
        #  requires exploding the words ahead of time 
        if train is not None:
            if isinstance(train[0], compat.string_types):
                train = [train]
                self._W=1
            elif not isinstance(train[0],collections.abc.Sequence):
                # if you mix strings and generators, you have only yourself
                #  to blame!
                for i in range(len(train)):
                    train[i]=list(train[i])

        if n == 1:
            if pad_right:
                sents=(chain(s,self._rpad) for s in train)
            else:
                sents=train
            fd=FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd,fd.B())
            else:
                self._model = estimator(fd,fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N=fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N+=len(sent)+delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token]+=1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        *estimator_args,
                                        **estimator_kwargs)

            # Code below here in this method, and the _words_following and _alpha method, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                if isclose(total_observed_pr,1.0):
                    total_observed_pr=1.0
                else:
                    assert 0.0 <= total_observed_pr <= 1.0,\
                           "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                if beta!=0.0:
                    assert (0.0 <= backoff_total_pr < 1.0), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = beta / (1.0 - backoff_total_pr)
                else:
                    assert ((0.0 <= backoff_total_pr < 1.0) or
                            isclose(1.0,backoff_total_pr)), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt
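
The weights computed above follow the usual Katz backoff form: alpha(ctxt) = (1 - sum over observed w of P(w | ctxt)) / (1 - sum over the same w of P_backoff(w | ctxt[1:])), i.e. beta, the probability mass left over after the observed continuations, renormalized by the mass the backoff model leaves for words not observed in this context.
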
Example #39
class TwitterCorpus(object):
    def __init__(self, args):
        self.dictionary = Dictionary()
        self.dictionary.add_word("<<<padding>>>")
        self.padding_value = self.dictionary.word2idx["<<<padding>>>"]
        self.max_vocab_size = args.max_vocab_size

        self.fdist = FreqDist()
        self.file_prepared = False
        self.username_re = re.compile("\@[\w]+")
        self.url_re = re.compile("http[s]?://[\w|\.|\?|\/]+")
        self.www_re = re.compile("www.[^ ]+")
        self.emoticon_re = re.compile(
            "(;D)|(:D)|(:/)|(=\))|(:-D)|(;-D)|(:\()|(=\()|(:\s{1}\()")
        self.run_on_re = re.compile(r"(\w)\1{2,}", re.DOTALL)
        self.negations_dic = {
            "isn't": "is not",
            "aren't": "are not",
            "wasn't": "was not",
            "weren't": "were not",
            "haven't": "have not",
            "hasn't": "has not",
            "hadn't": "had not",
            "won't": "will not",
            "wouldn't": "would not",
            "don't": "do not",
            "doesn't": "does not",
            "didn't": "did not",
            "can't": "can not",
            "couldn't": "could not",
            "shouldn't": "should not",
            "mightn't": "might not",
            "mustn't": "must not"
        }
        self.neg_pattern = re.compile(r'\b(' +
                                      '|'.join(self.negations_dic.keys()) +
                                      r')\b')

        self.datafile = os.path.join(args.data, "tweet_data.h5")
        self.data_handle = h5py.File(os.path.join(args.data, "tweet_data.h5"),
                                     'w')

        self.prepare_dataset(args.training, 'training')
        self.prepare_dataset(args.testing, 'testing')
        self.data_handle.close()
        self.file_prepared = True

    def __getstate__(self):
        ''' Do not pickle the handle to the h5 file '''
        state = self.__dict__.copy()
        del state['data_handle']
        return state

    def __setstate__(self, state):

        self.__dict__.update(state)
        if os.path.exists(self.datafile):
            self.file_prepared = True
        else:
            self.file_prepared = False

    def get_padding_idx(self):
        return self.padding_value

    def get_data_file(self):
        if self.file_prepared:
            return self.datafile
        else:
            print(
                'File is not prepared.  Re-build TwitterCorpus object properly.',
                file=sys.stderr)

    def prepare_dataset(self, path, data_split):
        """ Preprocess the dataset in `path` 
        data_split \in ['training','testing'] """

        outpath = path.replace(".csv", ".prepared.csv")
        self._make_freqdist(path)
        tokens, max_len, num_tweets = self._preprocess_and_build_dictionary(
            path, outpath)
        self._pack_to_h5(outpath, data_split, tokens, max_len, num_tweets)

    def _process_tweet(self, tweet):
        """ Apply feature transformations to each tweet in the dataset """

        # unique tokens with this, no depunct: 755992
        # removing single char tokens, expanding contractions: 277990
        # target vocab should be 76643, size reported in Kalchbrenner, Grefenstette & Blunsom

        tweet = tweet.strip()
        tweet = BeautifulSoup(tweet, 'lxml').get_text()
        tweet = tweet.replace(u"\ufffd", "?")
        # @usernames -> USERNAME
        tweet = re.sub(self.username_re, lambda x: "USERNAME", tweet)
        # URLS -> URL
        tweet = re.sub(self.url_re, lambda x: "URL", tweet)
        # www. URLs -> URL
        tweet = re.sub(self.www_re, lambda x: "URL", tweet)
        # expand negation contractions
        tweet = re.sub(self.neg_pattern,
                       lambda x: self.negations_dic[x.group()], tweet)

        # standardize emoticons
        tweet = re.sub(self.emoticon_re, lambda x: "", tweet)
        # shrink extended runs of any char
        tweet = re.sub(self.run_on_re, r"\1\1",
                       tweet)  # result = re.sub("(\d+) (\w+)", r"\2 \1")

        return tweet

    def _make_freqdist(self, path):
        """ Read all the tweets, calculate the frequencies of 
        the words appearing in processed tweets """

        translator = str.maketrans('', '', string.punctuation)
        print("Counting words in training...")
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            tweet_reader = csv.reader(f, delimiter=',', quotechar='"')
            for i, parts in enumerate(tweet_reader):
                tweet = parts[-1]
                clean_tweet = self._process_tweet(tweet)
                lc_clean_tweet = clean_tweet.lower()
                words = [
                    w for w in lc_clean_tweet.translate(translator).split()
                    if len(w) > 1
                ]
                self.fdist.update(words)
                if i % 10000 == 0:
                    print("Processed first ", i, "tweets")

    def _preprocess_and_build_dictionary(self, inpath, outpath, depunct=True):
        """ Preprocess the Twitter Sentiment data set in `inpath`,
        build the dictionary, and write the sanitized output to 
        `outpath`.  In addition, return how many unique tokens
        we see in the corpus. 
        
        return the number of unique tokens seen, the largest number
        of words seen in a single tweet, and the number of tweets 
        in this file.
        """

        assert os.path.exists(inpath)
        if depunct:
            translator = str.maketrans('', '', string.punctuation)

        with open(inpath, 'r', encoding='utf-8-sig',
                  errors='replace') as in_f, open(outpath,
                                                  'w',
                                                  encoding='utf-8') as out_f:
            tweet_reader = csv.reader(in_f, delimiter=',', quotechar='"')
            tweet_writer = csv.writer(out_f, delimiter=',', quotechar='"')
            max_len = 0
            vocab_words = frozenset(
                [w for w, c in self.fdist.most_common(self.max_vocab_size)])

            for i, parts in enumerate(tweet_reader):
                if (i % 10000) == 0:
                    print("Finished tweet ", i)
                tweet = parts[-1]
                clean_tweet = self._process_tweet(tweet)
                lc_clean_tweet = clean_tweet.lower()
                words = [
                    w for w in lc_clean_tweet.translate(translator).split()
                    if len(w) > 1 and w in vocab_words
                ]
                max_len = len(words) if len(words) > max_len else max_len

                for word in words:
                    self.dictionary.add_word(word)

                clean_line = parts[:-1] + [" ".join(words)]
                tweet_writer.writerow(clean_line)

        unique_tokens = len(self.dictionary)
        return unique_tokens, max_len, i + 1

    def _tweet_to_list(self, parts, max_len):
        label, tweet = parts[0], parts[-1]

        try:
            label = int(label)
        except ValueError:
            print('Cannot coerce ', label, ' to int ')
            label = -1
        words = tweet.split()
        encoded_words = [self.dictionary.word2idx[word] for word in words]
        encoded_words = encoded_words + [
            self.padding_value for i in range(max_len - len(words))
        ]
        assert (len(encoded_words) == max_len)
        return encoded_words, label

    def _calculate_amount_to_write(self, chunk, chunk_size, num_examples):
        amount_to_write = num_examples - (chunk * chunk_size)
        if amount_to_write < 0:
            amount_to_write = num_examples
        if amount_to_write < chunk_size:
            return amount_to_write
        else:
            return chunk_size

    def _pack_to_h5(self, path, group, tokens, max_len, num_examples):
        """ Build the word2idx data structure for the Twitter Sentiment data set in `path` 
        I'll use an hdf5 file to store the embedded seqs, labels.
        
        path := path to cleaned up tweet file
        tokens := number of tokens to encode
        group := 'training' or 'testing', which group in the h5 file
                do we encode the data from `path`
        max_len := most number of words observed in a tweet
        num_examples := number of tweets in this file in `path`
        """

        assert os.path.exists(path)

        # create groups for data, labels
        group_name = '/' + group
        this_group = self.data_handle.create_group(group_name)
        data_name = group + "_data"
        label_name = group + "_labels"

        chunk = 0
        chunk_size = 10000
        buffer_size = self._calculate_amount_to_write(chunk, chunk_size,
                                                      num_examples)

        data = this_group.create_dataset(data_name,
                                         shape=(num_examples, max_len),
                                         chunks=(buffer_size, max_len),
                                         dtype=np.int32)
        labels = this_group.create_dataset(label_name,
                                           shape=(num_examples, 1),
                                           dtype=np.int32)

        # parse, encode words in each tweet, write to h5file
        temp_array = np.empty((chunk_size, max_len), dtype=np.int32)
        temp_labels = np.empty((chunk_size, 1), dtype=np.int32)

        with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            tweet_reader = csv.reader(f, delimiter=',', quotechar='"')
            for i, parts in enumerate(tweet_reader):
                embedded_list, label = self._tweet_to_list(parts, max_len)
                temp_array[i % chunk_size, :] = np.array(embedded_list)
                temp_labels[i % chunk_size, 0] = label
                if (i + 1) % buffer_size == 0:
                    # write the buffer to the h5file
                    data[chunk * chunk_size:chunk * chunk_size +
                         buffer_size, :] = temp_array[0:buffer_size, :]
                    labels[chunk * chunk_size:chunk * chunk_size + buffer_size,
                           0] = temp_labels[0:buffer_size, 0]
                    chunk += 1
                    buffer_size = self._calculate_amount_to_write(
                        chunk, chunk_size, num_examples)
Example #40
        threestars.append(review)
    if stars[i] == 2:
        twostars.append(review)
    if stars[i] == 1:
        onestars.append(review)
    i = i + 1

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

print 'Getting words...'
for review in fivestars:
    if type(review) is str:
        for word in review.split():
            if word not in stop:
                word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
                label_word_fd['5'].update(
                    stemmer.stem(word.decode('utf-8')).lower())

for review in fourstars:
    if type(review) is str:
        for word in review.split():
            word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
            label_word_fd['4'].update(
                stemmer.stem(word.decode('utf-8')).lower())

for review in threestars:
    if type(review) is str:
        for word in review.split():
            stemmed = stemmer.stem(word.decode('utf-8')).lower()
            word_fd.update([stemmed])
            label_word_fd['3'].update([stemmed])
Beispiel #41
0
import string
from math import log

import numpy
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              FreqDist, MLEProbDist)


class EditDistanceFinder():
    def __init__(self):
        self.char_probs = ConditionalProbDist([], MLEProbDist)
        self.bichar_freqs = FreqDist()  # rebuilt as a FreqDist of bigrams in train_alignments()
        self.transp_freqs = FreqDist()
        self.DOWN, self.LEFT, self.DIAG, self.DOUBLE_DIAG = range(4)
        self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4)

    def train(self, fname):
        misspellings = []
        for line in open(fname):
            line = line.strip()
            if not (line): continue
            w1, w2 = line.split(",")
            misspellings.append((w1.strip(), w2.strip()))

        last_alignments = None
        done = False
        while not done:
            print("Iteration")
            alignments, bigrams = self.train_alignments(misspellings)
            self.train_costs(alignments, bigrams)
            done = (alignments == last_alignments)
            last_alignments = alignments

    def train_alignments(self, misspellings):
        alignments = []
        self.bichar_freqs = FreqDist()

        for error, corrected in misspellings:
            distance, this_alignments = self.align(corrected, error)
            alignments += this_alignments
            bigrams = [corrected[i:i + 2] for i in range(len(corrected) - 1)]
            self.bichar_freqs.update(bigrams)

        return alignments, bigrams

    def train_costs(self, alignments, bigrams):
        add_one_aligns = [(a, b) for a in string.ascii_lowercase
                          for b in string.ascii_lowercase]
        single_aligns = [(a, b) for a, b in alignments if len(a) < 2]

        char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns)
        self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist)

        double_aligns = [a for a, b in alignments if len(a) >= 2]
        self.transp_freqs = FreqDist(double_aligns)

    def align(self, w1, w2, verbose=False):
        M = len(w1) + 1
        N = len(w2) + 1
        table = numpy.zeros((M, N))
        backtrace = numpy.zeros((M, N))

        for i in range(1, M):
            w1_char = w1[i - 1]
            table[i, 0] = table[i - 1, 0] + self.del_cost(w1_char)
            backtrace[i, 0] = self.DOWN
        for j in range(1, N):
            w2_char = w2[j - 1]
            backtrace[0, j] = self.LEFT
            table[0, j] = table[0, j - 1] + self.ins_cost(w2_char)

        for i in range(1, M):
            w1_char = w1[i - 1]
            for j in range(1, N):
                w2_char = w2[j - 1]

                this_del = table[i - 1, j] + self.del_cost(w1_char)
                this_ins = table[i, j - 1] + self.ins_cost(w2_char)
                this_sub = table[i - 1, j - 1] + self.sub_cost(
                    w1_char, w2_char)

                if j > 1 and i > 1 and w1[i - 1] == w2[j - 2] and w1[
                        i - 2] == w2[j - 1] and w1[i - 1] != w1[i - 2]:
                    this_transp = table[i - 2, j - 2] + self.transp_cost(
                        w1_char, w2_char)
                else:
                    this_transp = 999999

                min_cost = min(this_del, this_ins, this_sub, this_transp)
                table[i, j] = min_cost

                if this_sub == min_cost:
                    backtrace[i, j] = self.DIAG
                elif this_transp == min_cost:
                    backtrace[i, j] = self.DOUBLE_DIAG
                elif this_ins == min_cost:
                    backtrace[i, j] = self.LEFT
                else:  # delete (backtrace DOWN)
                    backtrace[i, j] = self.DOWN

        alignments = []
        i = M - 1
        j = N - 1
        while (j or i):
            this_backtrace = backtrace[i, j]
            if this_backtrace == self.DIAG:  # sub
                alignments.append((w1[i - 1], w2[j - 1]))
                i -= 1
                j -= 1
            elif this_backtrace == self.DOUBLE_DIAG:
                alignments.append((w1[i - 2:i], w2[j - 2:j]))
                i -= 2
                j -= 2
            elif this_backtrace == self.DOWN:  # delete
                alignments.append((w1[i - 1], "%"))
                i -= 1
            elif this_backtrace == self.LEFT:  # insert
                alignments.append(("%", w2[j - 1]))
                j -= 1

        alignments.reverse()
        if verbose:
            print(table)
        return table[M - 1, N - 1], alignments

    def transp_cost(self, char1, char2):
        ## how often do char1 and char2 get transposed?
        return 1 - self.transp_prob(char1, char2)

    def del_cost(self, char):
        # '%' is the gap symbol used in the alignments produced by align()
        return 1 - self.char_probs[char].prob('%')

    def ins_cost(self, char):
        return 1 - self.char_probs['%'].prob(char)

    def sub_cost(self, char1, char2):
        return 1 - self.char_probs[char1].prob(char2)

    def transp_prob(self, char1, char2):
        # add-0.1 smoothing over the 26 x 26 space of character pairs
        numerator = self.transp_freqs[char1] + .1
        denominator = self.bichar_freqs[char1] + .1 * 26 * 26
        return numerator / denominator

    def prob(self, w1, w2):
        score, alignment = self.align(w1, w2)
        total_prob = 0
        for a, b in alignment:
            if len(a) > 1:
                total_prob += log(self.transp_prob(a[0], a[1]))
            else:
                total_prob += self.char_probs[a].logprob(b)
        return total_prob

    def show_alignment(self, alignments):
        print("String1:", " ".join([x[0] for x in alignments]))
        print("String2:", " ".join([x[1] for x in alignments]))
Beispiel #42
0
                  'THEREFORE', 'THEY', 'THEY\'D', 'THEY\'LL', 'THEY\'RE',
                  'THIRD', 'THIRTEEN', 'THIRTEENTH', 'THIRTIETH', 'THIRTY',
                  'THIS', 'THITHER', 'THOSE', 'THOUGH', 'THOUSAND',
                  'THOUSANDTH', 'THREE', 'THRICE', 'THROUGH', 'THUS', 'TILL',
                  'TO', 'TOWARDS', 'TODAY', 'TOMORROW', 'TOO', 'TWELFTH',
                  'TWELVE', 'TWENTIETH', 'TWENTY', 'TWICE', 'TWO', 'UNDER',
                  'UNDERNEATH', 'UNLESS', 'UNTIL', 'UP', 'US', 'VERY', 'WHEN',
                  'WAS', 'WASN\'T', 'WE', 'WE\'D', 'WE\'LL', 'WERE', 'WE\'RE',
                  'WEREN\'T', 'WE\'VE', 'WHAT', 'WHENCE', 'WHERE', 'WHEREAS',
                  'WHICH', 'WHILE', 'WHITHER', 'WHO', 'WHOM', 'WHOSE', 'WHY',
                  'WILL', 'WITH', 'WITHIN', 'WITHOUT', 'WON\'T', 'WOULD',
                  'WOULDN\'T', 'YES', 'YESTERDAY', 'YET', 'YOU', 'YOUR',
                  'YOU\'D', 'YOU\'LL', 'YOU\'RE', 'YOURS', 'YOURSELF',
                  'YOURSELVES', 'YOU\'VE']

files = ['IN', 'IP', 'LY', 'NA', 'OP', 'SP']
stop_words = set([word.lower() for word in function_words])

for file in files:
    with open(file + '.txt', 'w') as my_file:
        # accumulate counts across every Mini-CORE text for this register
        fd = FreqDist()
        for each in glob('Mini-CORE/1+' + file + '*.txt'):
            with open(each, 'r') as read_file:
                text = read_file.read().lower()
                cleaned_text = clean(text)
                tokens = nltk.word_tokenize(cleaned_text)
                words = [token for token in tokens if token not in stop_words]
                tokens_fd = FreqDist(words)
                fd.update(tokens_fd)
        print(fd.most_common(), file=my_file)
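
For context, each register file written above ends up holding the printed repr of a list of (token, count) pairs. A tiny illustration with a toy FreqDist (not Mini-CORE data):

from nltk import FreqDist

toy_fd = FreqDist(['data', 'text', 'data'])
print(toy_fd.most_common())  # [('data', 2), ('text', 1)]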