def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)

    num_docs = len(docs)

    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print(' dict', str(i) + '/' + str(len(tf_dists)))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd:
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd:
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
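# A minimal standalone sketch of the same TF-IDF pattern, assuming only NLTK and a
# trivial whitespace tokenizer in place of preprocess.preprocess_text (which is not
# shown in this snippet). Illustrative only, not the original module's API.
import math
from nltk import FreqDist

def tfidf_dicts_sketch(docs):
    doc_freqs = FreqDist()   # number of documents each word appears in
    tf_dists = []
    for doc in docs:
        fd = FreqDist(doc.lower().split())
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)
    num_docs = len(docs)
    return [
        {w: fd.freq(w) * math.log(num_docs / doc_freqs[w]) for w in fd}
        for fd in tf_dists
    ]

# Example: a word shared by both documents gets an IDF weight of log(2/2) = 0.
print(tfidf_dicts_sketch(["the cat sat", "the dog ran"]))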
def wrd_ngram_stats(texts, corpus, order, include_lower=False):
    all_wd_ngrams = FreqDist()
    text_wrd_ngrams = []
    for text in texts:
        if not text.endswith(".txt"):
            continue
        wrd_tokens = corpus.words(text)
        empty = len(corpus.raw(text)) == 0
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())
        if not empty:
            lower_wrds = [w.lower() for w in wrd_tokens if w.isalnum()]
            if include_lower:
                for n in range(1, order + 1):
                    wd_ng = ngrams(lower_wrds, n)
                    text_ngrams[n - 1].update(wd_ng)
                    if n == order:
                        all_wd_ngrams.update(wd_ng)
            else:
                wd_ng = ngrams(lower_wrds, order)
                text_ngrams[order - 1].update(wd_ng)
                all_wd_ngrams.update(wd_ng)
        text_wrd_ngrams.append(text_ngrams)
    return all_wd_ngrams, text_wrd_ngrams
def evaluate_html(content, html_conf):
    fdist = FreqDist()
    if html_conf['usehtml'] == False:
        logging.info('Discarding HTML tags')
        return fdist

    logging.info("\tEvaluating HTML")
    # try with TITLE tag
    titles = re.findall("<title>[A-Za-z0-9 ]+</title>", content)
    for title in titles:
        root = etree.fromstring(title)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['title']):
            fdist.update(stems)

    # try with H1 tag
    headers = re.findall("<h1>[A-Za-z0-9 ]+</h1>", content)
    for header in headers:
        root = etree.fromstring(header)
        words_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', root.text))
        terms_list = [x for x in words_list if x.lower() not in stopwords.words('english')]
        stems = steming(terms_list)
        for i in range(html_conf['h1']):
            fdist.update(stems)

    return fdist
def buildCategoryDictionary(category):
    tweetList = twitter_fetch.get_tweets_text(classn=category)
    freq = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    saveDictionaryToFile(freq, category + categoryDictFilePath)
    return freq
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if (pos != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))

    for review in negids:
        neg += 1
        if (neg != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
def tokenize_data(sentences):
    # tokenize the dataset
    fdist = FreqDist()
    tokenized_sents = []
    for sentence in sentences:
        tokenized_sent = [w.lower() for w in word_tokenize(sentence)]
        tokenized_sents.append(tokenized_sent)
        fdist.update(tokenized_sent)
    # print("Number of word types in the tokenized data: ", len(fdist))
    return tokenized_sents
def generate_freq_dist(samples):
    fdist = FreqDist()
    lemmatizer = WordNetLemmatizer()
    for sample in samples:
        temp = FreqDist([
            lemmatizer.lemmatize(word, "v")
            for sent in sent_tokenize(sample.text)
            for word in word_tokenize(sent)
        ])
        fdist.update(temp)
    return fdist
def reduce_text(t1, t2):
    words = FreqDist(t1[0])
    words.update(t2[0])
    try:
        bigrams = FreqDist(t1[1])
        bigrams.update(t2[1])
    except:
        logger.error('problem in reducing..')
        logger.error('t1: %s' % str(t1))
        logger.error('t2: %s' % str(t2))
    return words, bigrams
def sentence_ngrams(sentence):
    print(str('pid:{} ||'.format(os.getpid())), time.strftime("%y-%m-%d_%H:%M:%S"))
    sentence = sentence.strip()
    if not sentence:
        raise ValueError('Empty sentence!')
    words = sentence.split(' ')
    ngrams_bag = FreqDist()
    for i in range(4):
        ngrams_bag.update(ngrams(words, i + 1))
    return ngrams_bag
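# A minimal usage sketch for sentence_ngrams() above, assuming the imports the snippet
# relies on (os, time, nltk.FreqDist, nltk.util.ngrams) are already in scope.
bag = sentence_ngrams("the cat sat on the mat")
print(bag[("the",)])          # unigram count: 2
print(bag[("the", "cat")])    # bigram count: 1
print(bag.most_common(3))     # most frequent 1- to 4-grams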
async def gobaby(urls):
    """
    Prepare and launch asynchronous requests for the URLs and build the final word dictionary.
    :param urls:
    :return:
    """
    # Build the list of futures (coroutines that will run asynchronously)
    futures = [get_content(url) for url in urls]
    # Run the futures
    done, _ = await asyncio.wait(futures)
    # Dictionary in which the results are collected
    result_dict = FreqDist()
    # For every completed future
    for future in done:
        try:
            result_dict.update(give_me_my_dict(future.result()))
        except Exception as e:
            print('Got this error:', e)
    # Build the word cloud
    create_cloud(result_dict)
def analyze(data, out_dir):
    summary = {}
    freq = FreqDist()
    sentence_length = defaultdict(list)
    year_freq_dist = defaultdict(FreqDist)
    year_dist = defaultdict(int)
    year_month_dist = defaultdict(int)
    year_quarter_dist = defaultdict(int)
    has_date = no_date = sentences = words = 0

    for year, date_str, title, text in data:
        date = parsedate(date_str)
        logger.debug('%s -> %s' % (date_str, str(date)))
        freq.update(ngram_phrases(text, 3))
        if date:
            # Since we can't use strftime for years before 1900, we need to use isoformat
            year_str = date.isoformat()[:4]
            year_mo_str = date.isoformat()[:7]
            has_date += 1
        else:
            no_date += 1
            year_str = ''
            year_mo_str = ''
        if year_str:
            year_range = get_year_range(year_str)
            sentence_length[year_range].extend(sentence_lengths(text))
            year_freq_dist[year_range].update(ngram_phrases(text, 3))
        year_dist[year] += 1
        if year_mo_str:
            year_month_dist[year_mo_str] += 1
            year_quarter_dist[year_quarter(year_mo_str)] += 1
        sentences += count_sentences(text)
        words += count_words(text)

    logger.debug('Documents with a valid date: %d Documents without a valid date: %d' % (has_date, no_date))
    logger.debug('Total # Sentences: %d' % sentences)
    logger.debug('Total # Words: %d' % words)

    generate_dict_csv(['year', 'cnt'], year_dist, os.path.join(out_dir, 'year-data.csv'))
    generate_dict_csv(['yearmo', 'cnt'], year_month_dist, os.path.join(out_dir, 'year-mo-data.csv'))
    generate_dict_csv(['yearq', 'cnt'], year_quarter_dist, os.path.join(out_dir, 'year-quarter-data.csv'))
    generate_stream_js(year_freq_dist, os.path.join(out_dir, 'stream-data.json'))
    generate_cloud_csv(year_freq_dist, os.path.join(out_dir, 'year-phrase-data.csv'))
    generate_sentence_length_csv(sentence_length, os.path.join(out_dir, 'data-sentence-lengths.csv'))
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary
    of word, c-value (ranking) pairs"""

    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))

    # word -> C-value(word)
    phrase_scores = {}
    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()
    # word -> (t(word), c(word))
    sub_phrase_scores = {}

    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(),
                                         key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1

    return phrase_scores, phrase_frequencies
def updateCategoryDictionary(category):
    tweetList = twitter_fetch.get_new_tweets(classn=category)
    freq = FreqDist()
    tmpDict = FreqDist()
    for tweet in tweetList:
        freq.update(word for word in tokenizeTweet(tweet))
    try:
        oldDict = readDictionaryFromFile(category + categoryDictFilePath)
    except:
        newDict = buildCategoryDictionary(category)
        return newDict
    oldDict.update(freq)
    saveDictionaryToFile(oldDict, category + categoryDictFilePath)
    return oldDict
def generate_lookup(ngrams: List[List[str]]):
    fdist = FreqDist()
    for entry in ngrams:
        fdist.update(list(entry))

    lookup = {}
    for ngram in fdist:
        key = ngram[:-1]
        word = ngram[-1]
        count = fdist[ngram]
        if key not in lookup:
            lookup[key] = {}
        lookup[key][word] = count

    return lookup
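# A minimal usage sketch for generate_lookup() above, assuming the function and its
# imports (typing.List, nltk.FreqDist) are in scope. The [:-1]/[-1] slicing in the
# lookup suggests each entry is meant to be the list of n-gram tuples for one
# sentence (e.g. produced by nltk.util.ngrams), which is what this sketch passes in.
from nltk.util import ngrams as make_ngrams

sentences = [["the", "cat", "sat"], ["the", "cat", "ran"]]
bigrams_per_sentence = [list(make_ngrams(tokens, 2)) for tokens in sentences]
lookup = generate_lookup(bigrams_per_sentence)
print(lookup[("the",)])   # {'cat': 2}
print(lookup[("cat",)])   # {'sat': 1, 'ran': 1}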
def standard_log_key(log_key_sequence_str):
    # Split the log key sequence into windows with a sliding window;
    # here every 4 consecutive log keys form one sequence.
    tokens = log_key_sequence_str.split(' ')
    # Convert the log keys to ints
    tokens = [int(i) for i in tokens]
    K = max(tokens) + 1  # number of distinct log key types
    # print("the tokens are:", tokens)

    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)
    # Reminder of how nltk.util.ngrams works:
    #   ngrams(['1', '2', '3', '4', '5'], 2)
    #   -> ('1', '2'), ('2', '3'), ('3', '4'), ('4', '5')
    bigramfdist_4.update(bigrams_4)
    print("the bigramfdist_4 is:", list(bigramfdist_4.keys()))

    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))
    # print("the seq is:", seq)
    X, Y = seq[:, :3], seq[:, 3:4]
    # e.g. seq.shape == (253, 4), X.shape == (253, 3), Y.shape == (253, 1)
    X = np.reshape(X, (-1, 3, 1))

    # Scale the values down proportionally into the [0, 1] range
    X = X / K
    # Convert the integer labels to one-hot vectors
    num_classes = len(list(set(Y.T.tolist()[0]))) + 1  # num_classes is the number of distinct values in Y
    Y = keras.utils.to_categorical(Y)  # num_classes=num_classes
    return X, Y
def char_ngram_stats(texts, corpus, order, include_lower=False):
    '''
    Find character n-grams in some texts.

    @param texts: List of texts
    @param corpus: The corpus that holds the texts
    @param order: The order of the n-grams to consider.
    @param include_lower: Whether to include list of lower-order n-grams in output
    @return: A tuple: First element is a list of all n-grams (only of given order)
        across all texts. Second element is a matrix with a list of lists of
        1-grams, 2-grams, ..., n-grams per text.
    '''
    all_char_ngrams = FreqDist()
    text_char_ngrams = []  # Char n-grams found in each text
    for text in texts:
        if not text.endswith(".txt"):
            continue
        empty = len(corpus.raw(text)) == 0
        # One freq. dist per n
        text_ngrams = []
        for _ in range(order):
            text_ngrams.append(FreqDist())
        if not empty:
            text_str = corpus.raw(text).replace('\r', '').replace('\n', ' ')
            if include_lower:
                for n in range(1, order + 1):
                    char_ng = ngrams(text_str, n)
                    text_ngrams[n - 1].update(char_ng)
                    if n == order:
                        all_char_ngrams.update(char_ng)
            else:
                char_ng = ngrams(text_str, order)
                text_ngrams[order - 1].update(char_ng)
                all_char_ngrams.update(char_ng)
        text_char_ngrams.append(text_ngrams)
    return all_char_ngrams, text_char_ngrams
def get_train(log_key_sequence_str):
    # # we have the sequence of log keys
    # seq = np.array(log_key_sequence)

    # divide the log sequence into windows of 4 keys each
    tokens = log_key_sequence_str.split(' ')
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('E', '')
        tokens[i] = int(tokens[i])
    # print("the tokens are:", tokens)

    bigramfdist_4 = FreqDist()
    bigrams_4 = ngrams(tokens, 4)
    bigramfdist_4.update(bigrams_4)
    # print("the bigramfdist_4 is:", bigramfdist_4.keys())

    # we set the length of history logs as 3
    seq = np.array(list(bigramfdist_4.keys()))
    # print("the seq is:", seq)
    X, Y = seq[:, :3], seq[:, 3:4]
    return X, Y
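# A minimal usage sketch for get_train() above, assuming the imports the snippet relies
# on (numpy as np, nltk.FreqDist, nltk.util.ngrams) are in scope. Each row of X holds 3
# consecutive log keys and Y holds the key that followed them; note that collecting the
# 4-grams in a FreqDist keeps only the distinct windows.
X, Y = get_train("E5 E5 E11 E9 E11 E9 E26")
print(X)  # e.g. [[ 5  5 11], [ 5 11  9], [11  9 11], [ 9 11  9]]
print(Y)  # e.g. [[ 9], [11], [ 9], [26]]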
def reduce_tweets(t1, t2):
    tags = FreqDist(t1[0])
    tags.update(t2[0])
    words = FreqDist(t1[1])
    words.update(t2[1])
    places = FreqDist(t1[2])
    places.update(t2[2])
    bigrams = FreqDist(t1[3])
    bigrams.update(t2[3])
    return tags, words, places, bigrams
def word_count(drug=None, limit=None, pos_filter=False, lemma=True):
    """Scans comment texts (from drug_mentions.texts) for selected drug,
    calculates most common words.

    KWARGS:
        drug: string or None. Drug selector. Allows three cases:
            * None: scrape all comments in database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
              drugs, not referencing a specific drug.
            * [drug name]: comments referencing specific drug.
            Default None. Passed to drug_mentions.texts.
        limit: int or None. Optional limit on SQL queries retrieved by
            drug_mentions.texts. Defaults to None (returns all hits).
        pos_filter: boolean. Passed to tokenize(), set True to use
            part-of-speech filtering.
        lemma: boolean. Passed to tokenize(), set True to use lemmatization.
    RETURNS:
        freq: nltk.probability.FreqDist object. Frequency distribution of
            words from comments.
    RAISES:
        ValueError: for invalid drug name.
    """
    try:
        texts = dm.texts(drug=drug, limit=limit)
    except ValueError:
        raise ValueError('Invalid drug name.')

    freq = FreqDist()
    for text in texts:
        freq.update(tokenize(text, drug, pos_filter=pos_filter, lemma=lemma))
    return freq
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd[word] += 1
        doc_freqs.update(fd.keys())
        tf_dists.append(fd)

    all_tokens = list(doc_freqs.keys())
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
def buildGoogleUnigram():
    DirPrefix = "/home/jcavalie/googleNgrams_unigrams/"
    unigramFiles = os.listdir(DirPrefix)
    unigramFiles = list(map(lambda _fileName: DirPrefix + _fileName, unigramFiles))
    masterUnigram = FreqDist()

    with multiprocessing.Pool(8, initializer=initProcess) as ProcessPool:
        resAsync = ProcessPool.map_async(_buildUnigram, unigramFiles)
        results = resAsync.get()
        ProcessPool.close()
        ProcessPool.join()

    print("all jobs finished, building master unigram")
    for freqdist in results:
        masterUnigram.update(freqdist)

    with open("PickledData/GoogleUnigram.pickle", 'wb') as pklFile:
        pickle.dump(masterUnigram, pklFile, pickle.HIGHEST_PROTOCOL)
    return
class AddAlphaBigramModel():
    def __init__(self, alpha=0.1):
        self.vocabulary = set()
        self.V = 0
        self.bigrams = ConditionalFreqDist([])
        self.unigrams = FreqDist([])
        self.alpha = alpha

    def train(self):
        self.vocabulary = set()
        this_bigrams = []
        self.unigrams = FreqDist([])

        for fileid in gutenberg.fileids():
            for sentence in gutenberg.sents(fileid):
                words = ["<s>"] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>"]
                this_bigrams += bigrams(words)
                self.vocabulary.update(words)
                self.unigrams.update(words)

        self.bigrams = ConditionalFreqDist(this_bigrams)
        self.V = len(self.vocabulary)

    def bigram_prob(self, w1, w2):
        numerator = self.bigrams[w1][w2] + self.alpha
        denominator = self.bigrams[w1].N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def unigram_prob(self, w):
        numerator = self.unigrams[w] + self.alpha
        denominator = self.unigrams.N() + (self.alpha * self.V)
        return math.log(numerator / denominator)

    def __contains__(self, w):
        return w in self.vocabulary
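# A minimal, self-contained sketch of the same add-alpha estimate on a toy corpus, so
# the smoothing formula can be checked without downloading the Gutenberg corpus:
#   P(w2 | w1) = (count(w1, w2) + alpha) / (count(w1) + alpha * |V|)
import math
from nltk import ConditionalFreqDist, bigrams

sents = [["<s>", "the", "cat", "sat", "</s>"],
         ["<s>", "the", "dog", "sat", "</s>"]]
vocab = {w for s in sents for w in s}
cfd = ConditionalFreqDist(bg for s in sents for bg in bigrams(s))

alpha, V = 0.1, len(vocab)
prob = (cfd["the"]["cat"] + alpha) / (cfd["the"].N() + alpha * V)
print(math.log(prob))  # log P(cat | the) = log(1.1 / 2.6)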
def _create_vocabulary(self):
    """Analyze all the text sentences in the data set and create a vocabulary:
    1. The dataset vocabulary
    2. Number of words in the vocabulary
    3. Length of the longest sentence
    """
    frequencies = FreqDist()
    max_sentence_length = 0

    for idx in range(self.__len__()):
        txt_path = os.path.join(self.text_dir_path,
                                self.images_df.iloc[idx].path + ".txt")
        with open(txt_path, "r") as f:
            for line in f:
                tokens = [token.lower() for token in self.tokenizer.tokenize(line)]
                if len(tokens) > max_sentence_length:
                    max_sentence_length = len(tokens)
                frequencies.update(tokens)

    # Finally, create the vocabulary object from the torchtext library.
    vocabulary = Vocab(frequencies, min_freq=2, specials=["<unk>", "<eos>"])
    return vocabulary, len(vocabulary.itos), max_sentence_length
def process_documents(path, html_conf):
    logging.info("Using documents from \"" + path + "\" directory ")
    if path[-1] != "/":
        path = path + "/"

    documents = {}
    allterms = {}
    listing = os.listdir(path)
    allfreq = FreqDist()

    # retrieving document content - discarding structure
    logging.info("Processing files...")
    for infile in listing:
        logging.info("\tReading document " + infile)
        raw_doc = open(path + infile, 'r').read()
        nonhtml_doc = nltk.clean_html(raw_doc)
        word_list = nltk.word_tokenize(re.sub('[^A-Za-z0-9 ]', ' ', raw_doc))
        terms_list = [x.lower() for x in word_list if x.lower() not in stopwords.words('english')]
        stemmes = steming(terms_list)
        for stem in stemmes:
            allterms[stem] = 0
        fdist = FreqDist(word.lower() for word in stemmes)
        allfreq.update(word.lower() for word in stemmes)
        htmldist = evaluate_html(raw_doc.lower(), html_conf)
        fdist.update(htmldist)
        allfreq.update(htmldist)
        documents[infile] = {'docname': infile, 'terms': stemmes, 'tf': fdist, 'tfidf': None}

    for key, doc in documents.iteritems():
        doctfidf = compute_tfidf(doc, documents)
        documents[key]['tfidf'] = dict(allterms.items() + doctfidf.items())

    return documents, allfreq
f.close()

banset = set(stoplist)
count = 0
for hotel in wordlists.fileids():
    print hotel
    list1 = wordlists.words(hotel)
    list2 = []
    for w in list1:
        list2.append(w)
    list3 = [w.strip() for w in list2]
    if (count == 0):
        fdict = FreqDist(list3)
    else:
        fdict.update(list3)
    count += 1

print len(fdict)
fdict2 = fdict.copy()
for w in fdict.keys()[:]:
    if w.strip() in banset or len(w.strip()) < 3 or len(w.strip()) > 25:
        del fdict2[w]
    elif isinstance(w, unicode):
        del fdict2[w]

for w in fdict2.keys():
    if len(w) < 3:
        print w, len(w)
                        testsets['neutral'])
    classifier.show_most_informative_features()

def word_feats(words):
    return dict([(word, True) for word in words])

print 'evaluating single word features'
evaluate_classifier(word_feats)

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in tweets.words(categories=['pos']):
    word_fd.update([word.lower()])
    label_word_fd['pos'].update([word.lower()])

for word in tweets.words(categories=['neg']):
    word_fd.update([word.lower()])
    label_word_fd['neg'].update([word.lower()])

for word in tweets.words(categories=['neutral']):
    word_fd.update([word.lower()])
    label_word_fd['neutral'].update([word.lower()])

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
    # featureBigramNeg.append(helperFuntions.bigramReturner(xNew))
    featureUnigramNeg.append(helperFuntions.getFeatureVector(xNew))
    # break

for x in dataPosTrain:
    # print x
    xNew = helperFuntions.removePunctuation(x)
    xNew = helperFuntions.toLower(xNew)
    xNew = helperFuntions.removeNumbers(xNew)
    xNew = helperFuntions.removeStopWords(xNew)
    # featureBigramPos.append(helperFuntions.bigramReturner(xNew))
    featureUnigramPos.append(helperFuntions.getFeatureVector(xNew))
    # break

for word in featureUnigramPos:
    word_fd.update(word)
    label_word_fd['pos'].update(word)

for word in featureUnigramNeg:
    word_fd.update(word)
    label_word_fd['neg'].update(word)

# print featureBigramPos
# print featureUnigramPos

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
fin.close()

count = 0
for hotel in wordlists.fileids():
    list4 = []
    print hotel
    taglist = tagger.tag(wordlists.words(hotel))
    list1 = find_chunk('CHUNK: {<JJ.*> <RB>* <NN.*>+}')
    list2 = find_chunk2('CHUNK: {<NN.*>+ <VB.*> <RB>* <JJ.*>}')
    list3 = find_chunk3('CHUNK: {<VB.*> <RB>* <JJ.*> <NN.*>}')
    list4 = list1 + list2 + list3
    if (count == 0):
        fdict = FreqDist(list4)
    else:
        fdict.update(list4)
    count += 1

print 'Size of dictionary:', len(fdict)
print ''

f = open('stoplist.txt', 'r')
stoplist = []
ban = 'IV'
while (ban != ''):
    ban = f.readline()
    stoplist.append(ban.strip())
f.close()

banset = set(stoplist)
fdict2 = fdict.copy()
def __init__(self, n, train, pad_left=False, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs): """ Creates an ngram language model to capture patterns in n consecutive words of training text. An estimator smooths the probabilities derived from the text and may allow generation of ngrams not seen during training. :param n: the order of the language model (ngram size) :type n: C{int} :param train: the training text :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} :param estimator: a function for generating a probability distribution---defaults to MLEProbDist :type estimator: a function that takes a C{ConditionalFreqDist} and returns a C{ConditionalProbDist} :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s> :type pad_left: bool :param pad_right: whether to pad the right of each sentence with </s> :type pad_right: bool :param estimator_args: Extra arguments for estimator. These arguments are usually used to specify extra properties for the probability distributions of individual conditions, such as the number of bins they contain. Note: For backward-compatibility, if no arguments are specified, the number of bins in the underlying ConditionalFreqDist are passed to the estimator as an argument. :type estimator_args: (any) :param estimator_kwargs: Extra keyword arguments for the estimator :type estimator_kwargs: (any) """ # protection from cryptic behavior for calling programs # that use the pre-2.0.2 interface assert(isinstance(pad_left, bool)) assert(isinstance(pad_right, bool)) # make sure n is greater than zero, otherwise print it assert (n > 0), n # For explicitness save the check whether this is a unigram model self.is_unigram_model = (n == 1) # save the ngram order number self._n = n # save left and right padding self._lpad = ('<s>',) * (n - 1) if pad_left else () # Need _rpad even for unigrams or padded entropy will give # wrong answer because '</s>' will be treated as unseen... self._rpad = ('</s>',) if pad_right else () self._padLen = len(self._lpad)+len(self._rpad) self._N=0 delta = 1+self._padLen-n # len(sent)+delta == ngrams in sent if estimator is None: assert (estimator_args is ()) and (estimator_kwargs=={}),\ "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs) estimator = lambda fdist, bins: MLEProbDist(fdist) # Given backoff, a generator isn't acceptable if not isinstance(train,collections.abc.Sequence): train=list(train) self._W = len(train) # Coerce to list of list -- note that this means to train charGrams, # requires exploding the words ahead of time if train is not None: if isinstance(train[0], compat.string_types): train = [train] self._W=1 elif not isinstance(train[0],collections.abc.Sequence): # if you mix strings and generators, you have only yourself # to blame! 
for i in range(len(train)): train[i]=list(train[i]) if n == 1: if pad_right: sents=(chain(s,self._rpad) for s in train) else: sents=train fd=FreqDist() for s in sents: fd.update(s) if not estimator_args and not estimator_kwargs: self._model = estimator(fd,fd.B()) else: self._model = estimator(fd,fd.B(), *estimator_args, **estimator_kwargs) self._N=fd.N() else: cfd = ConditionalFreqDist() self._ngrams = set() for sent in train: self._N+=len(sent)+delta for ngram in ingrams(chain(self._lpad, sent, self._rpad), n): self._ngrams.add(ngram) context = tuple(ngram[:-1]) token = ngram[-1] cfd[context][token]+=1 if not estimator_args and not estimator_kwargs: self._model = ConditionalProbDist(cfd, estimator, len(cfd)) else: self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) # recursively construct the lower-order models if not self.is_unigram_model: self._backoff = NgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs) # Code below here in this method, and the _words_following and _alpha method, are from # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015" self._backoff_alphas = dict() # For each condition (or context) for ctxt in cfd.conditions(): backoff_ctxt = ctxt[1:] backoff_total_pr = 0.0 total_observed_pr = 0.0 # this is the subset of words that we OBSERVED following # this context. # i.e. Count(word | context) > 0 for word in self._words_following(ctxt, cfd): total_observed_pr += self.prob(word, ctxt) # we also need the total (n-1)-gram probability of # words observed in this n-gram context backoff_total_pr += self._backoff.prob(word, backoff_ctxt) if isclose(total_observed_pr,1.0): total_observed_pr=1.0 else: assert 0.0 <= total_observed_pr <= 1.0,\ "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr) # beta is the remaining probability weight after we factor out # the probability of observed words. # As a sanity check, both total_observed_pr and backoff_total_pr # must be GE 0, since probabilities are never negative beta = 1.0 - total_observed_pr if beta!=0.0: assert (0.0 <= backoff_total_pr < 1.0), \ "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) alpha_ctxt = beta / (1.0 - backoff_total_pr) else: assert ((0.0 <= backoff_total_pr < 1.0) or isclose(1.0,backoff_total_pr)), \ "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) alpha_ctxt = 0.0 self._backoff_alphas[ctxt] = alpha_ctxt
class TwitterCorpus(object): def __init__(self, args): self.dictionary = Dictionary() self.dictionary.add_word("<<<padding>>>") self.padding_value = self.dictionary.word2idx["<<<padding>>>"] self.max_vocab_size = args.max_vocab_size self.fdist = FreqDist() self.file_prepared = False self.username_re = re.compile("\@[\w]+") self.url_re = re.compile("http[s]?://[\w|\.|\?|\/]+") self.www_re = re.compile("www.[^ ]+") self.emoticon_re = re.compile( "(;D)|(:D)|(:/)|(=\))|(:-D)|(;-D)|(:\()|(=\()|(:\s{1}\()") self.run_on_re = re.compile(r"(\w)\1{2,}", re.DOTALL) self.negations_dic = { "isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not", "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not", "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not", "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not", "mustn't": "must not" } self.neg_pattern = re.compile(r'\b(' + '|'.join(self.negations_dic.keys()) + r')\b') self.datafile = os.path.join(args.data, "tweet_data.h5") self.data_handle = h5py.File(os.path.join(args.data, "tweet_data.h5"), 'w') self.prepare_dataset(args.training, 'training') self.prepare_dataset(args.testing, 'testing') self.data_handle.close() self.file_prepared = True def __getstate__(self): ''' Do not pickle the handle to the h5 file ''' state = self.__dict__.copy() del state['data_handle'] return state def __setstate__(self, state): self.__dict__.update(state) if os.path.exists(self.datafile): self.file_prepared = True else: self.file_prepared = False def get_padding_idx(self): return self.padding_value def get_data_file(self): if self.file_prepared: return self.datafile else: print( 'File is not prepared. Re-build TwitterCorpus object properly.', file=sys.stderr) def prepare_dataset(self, path, data_split): """ Preprocess the dataset in `path` data_split \in ['training','testing'] """ outpath = path.replace(".csv", ".prepared.csv") self._make_freqdist(path) tokens, max_len, num_tweets = self._preprocess_and_build_dictionary( path, outpath) self._pack_to_h5(outpath, data_split, tokens, max_len, num_tweets) def _process_tweet(self, tweet): """ Apply feature transformations to each tweet in the dataset """ # unique tokens with this, no depunct: 755992 # removing single char tokens, expanding contractions: 277990 # target vocab should be 76643, size reported in Kalchbrenner & Gref & Blunsom tweet = tweet.strip() tweet = BeautifulSoup(tweet, 'lxml').get_text() tweet = tweet.replace(u"\ufffd", "?") # @usernames -> USERNAME tweet = re.sub(self.username_re, lambda x: "USERNAME", tweet) # URLS -> URL tweet = re.sub(self.url_re, lambda x: "URL", tweet) # www. 
URLs -> URL tweet = re.sub(self.www_re, lambda x: "URL", tweet) # expand negation contractions tweet = re.sub(self.neg_pattern, lambda x: self.negations_dic[x.group()], tweet) # standardize emoticons tweet = re.sub(self.emoticon_re, lambda x: "", tweet) # shrink extended runs of any char tweet = re.sub(self.run_on_re, r"\1\1", tweet) # result = re.sub("(\d+) (\w+)", r"\2 \1") return tweet def _make_freqdist(self, path): """ Read all the tweets, calculate the frequencies of the words appearing in processed tweets """ translator = str.maketrans('', '', string.punctuation) print("Counting words in training...") with open(path, 'r', encoding='utf-8', errors='replace') as f: tweet_reader = csv.reader(f, delimiter=',', quotechar='"') for i, parts in enumerate(tweet_reader): tweet = parts[-1] clean_tweet = self._process_tweet(tweet) lc_clean_tweet = clean_tweet.lower() words = [ w for w in lc_clean_tweet.translate(translator).split() if len(w) > 1 ] self.fdist.update(words) if i % 10000 == 0: print("Processed first ", i, "tweets") def _preprocess_and_build_dictionary(self, inpath, outpath, depunct=True): """ Preprocess the Twitter Sentiment data set in `inpath`, build the dictionary, and write the sanitized output to `outpath`. In addition, return how many unique tokens we see in the corpus. return the number of unique tokens seen, the largest number of words seen in a single tweet, and the number of tweets in this file. """ assert os.path.exists(inpath) if depunct: translator = str.maketrans('', '', string.punctuation) with open(inpath, 'r', encoding='utf-8-sig', errors='replace') as in_f, open(outpath, 'w', encoding='utf-8') as out_f: tweet_reader = csv.reader(in_f, delimiter=',', quotechar='"') tweet_writer = csv.writer(out_f, delimiter=',', quotechar='"') max_len = 0 vocab_words = frozenset( [w for w, c in self.fdist.most_common(self.max_vocab_size)]) for i, parts in enumerate(tweet_reader): if (i % 10000) == 0: print("Finished tweet ", i) tweet = parts[-1] clean_tweet = self._process_tweet(tweet) lc_clean_tweet = clean_tweet.lower() words = [ w for w in lc_clean_tweet.translate(translator).split() if len(w) > 1 and w in vocab_words ] max_len = len(words) if len(words) > max_len else max_len for word in words: self.dictionary.add_word(word) clean_line = parts[:-1] + [" ".join(words)] tweet_writer.writerow(clean_line) unique_tokens = len(self.dictionary) return unique_tokens, max_len, i + 1 def _tweet_to_list(self, parts, max_len): label, tweet = parts[0], parts[-1] try: label = int(label) except ValueError: print('Cannot coerce ', label, ' to int ') label = -1 words = tweet.split() encoded_words = [self.dictionary.word2idx[word] for word in words] encoded_words = encoded_words + [ self.padding_value for i in range(max_len - len(words)) ] assert (len(encoded_words) == max_len) return encoded_words, label def _calculate_amount_to_write(self, chunk, chunk_size, num_examples): amount_to_write = num_examples - (chunk * chunk_size) if amount_to_write < 0: amount_to_write = num_examples if amount_to_write < chunk_size: return amount_to_write else: return chunk_size def _pack_to_h5(self, path, group, tokens, max_len, num_examples): """ Build the word2idx data structure for the Twitter Sentiment data set in `path` I'll use an hdf5 file to store the embedded seqs, labels. 
path := path to cleaned up tweet file tokens := number of tokens to encode group := 'training' or 'testing', which group in the h5 file do we encode the data from `path` max_len := most number of words observed in a tweet num_examples := number of tweets in this file in `path` """ assert os.path.exists(path) # create groups for data, labels group_name = '/' + group this_group = self.data_handle.create_group(group_name) data_name = group + "_data" label_name = group + "_labels" chunk = 0 chunk_size = 10000 buffer_size = self._calculate_amount_to_write(chunk, chunk_size, num_examples) data = this_group.create_dataset(data_name, shape=(num_examples, max_len), chunks=(buffer_size, max_len), dtype=np.int32) labels = this_group.create_dataset(label_name, shape=(num_examples, 1), dtype=np.int32) # parse, encode words in each tweet, write to h5file temp_array = np.empty((chunk_size, max_len), dtype=np.int32) temp_labels = np.empty((chunk_size, 1), dtype=np.int32) with open(path, 'r', encoding='utf-8-sig', errors='replace') as f: ids = torch.LongTensor(tokens) token = 0 tweet_reader = csv.reader(f, delimiter=',', quotechar='"') for i, parts in enumerate(tweet_reader): embedded_list, label = self._tweet_to_list(parts, max_len) temp_array[i % chunk_size, :] = np.array(embedded_list) temp_labels[i % chunk_size, 0] = label if (i + 1) % buffer_size == 0: # write the buffer to the h5file data[chunk * chunk_size:chunk * chunk_size + buffer_size, :] = temp_array[0:buffer_size, :] labels[chunk * chunk_size:chunk * chunk_size + buffer_size, 0] = temp_labels[0:buffer_size, 0] chunk += 1 buffer_size = self._calculate_amount_to_write( chunk, chunk_size, num_examples)
        threestars.append(review)
    if stars[i] == 2:
        twostars.append(review)
    if stars[i] == 1:
        onestars.append(review)
    i = i + 1

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

print 'Getting words...'
for review in fivestars:
    if type(review) is str:
        for word in review.split():
            if word not in stop:
                word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
                label_word_fd['5'].update(stemmer.stem(word.decode('utf-8')).lower())

for review in fourstars:
    if type(review) is str:
        for word in review.split():
            word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
            label_word_fd['4'].update(stemmer.stem(word.decode('utf-8')).lower())

for review in threestars:
    if type(review) is str:
        for word in review.split():
            word_fd.update(stemmer.stem(word.decode('utf-8')).lower())
            label_word_fd['3'].update(
class EditDistanceFinder(): def __init__(self): self.char_probs = ConditionalProbDist([], MLEProbDist) self.bichar_freqs = ConditionalFreqDist([]) self.transp_freqs = FreqDist() self.DOWN, self.LEFT, self.DIAG, self.DOUBLE_DIAG = range(4) self.INSERT, self.DELETE, self.SUBST, self.TRANSP = range(4) def train(self, fname): misspellings = [] for line in open(fname): line = line.strip() if not (line): continue w1, w2 = line.split(",") misspellings.append((w1.strip(), w2.strip())) last_alignments = None done = False while not done: print("Iteration") alignments, bigrams = self.train_alignments(misspellings) self.train_costs(alignments, bigrams) done = (alignments == last_alignments) last_alignments = alignments def train_alignments(self, misspellings): alignments = [] self.bichar_freqs = FreqDist() for error, corrected in misspellings: distance, this_alignments = self.align(corrected, error) alignments += this_alignments bigrams = [corrected[i:i + 2] for i in range(len(corrected) - 1)] self.bichar_freqs.update(bigrams) return alignments, bigrams def train_costs(self, alignments, bigrams): add_one_aligns = [(a, b) for a in string.ascii_lowercase for b in string.ascii_lowercase] single_aligns = [(a, b) for a, b in alignments if len(a) < 2] char_aligns = ConditionalFreqDist(single_aligns + add_one_aligns) self.char_probs = ConditionalProbDist(char_aligns, MLEProbDist) double_aligns = [a for a, b in alignments if len(a) >= 2] self.transp_freqs = FreqDist(double_aligns) def align(self, w1, w2, verbose=False): M = len(w1) + 1 N = len(w2) + 1 table = numpy.zeros((M, N)) backtrace = numpy.zeros((M, N)) for i in range(1, M): w1_char = w1[i - 1] table[i, 0] = table[i - 1, 0] + self.del_cost(w1_char) backtrace[i, 0] = self.DOWN for j in range(1, N): w2_char = w2[j - 1] backtrace[0, j] = self.LEFT table[0, j] = table[0, j - 1] + self.ins_cost(w2_char) for i in range(1, M): w1_char = w1[i - 1] for j in range(1, N): w2_char = w2[j - 1] this_del = table[i - 1, j] + self.del_cost(w1_char) this_ins = table[i, j - 1] + self.ins_cost(w2_char) this_sub = table[i - 1, j - 1] + self.sub_cost( w1_char, w2_char) if j > 1 and i > 1 and w1[i - 1] == w2[j - 2] and w1[ i - 2] == w2[j - 1] and w1[i - 1] != w1[i - 2]: this_transp = table[i - 2, j - 2] + self.transp_cost( w1_char, w2_char) else: this_transp = 999999 min_cost = min(this_del, this_ins, this_sub, this_transp) table[i, j] = min_cost if this_sub == min_cost: backtrace[i, j] = self.DIAG elif this_transp == min_cost: backtrace[i, j] = self.DOUBLE_DIAG elif this_ins == min_cost: backtrace[i, j] = self.LEFT else: # insert backtrace[i, j] = self.DOWN alignments = [] i = M - 1 j = N - 1 while (j or i): this_backtrace = backtrace[i, j] if this_backtrace == self.DIAG: # sub alignments.append((w1[i - 1], w2[j - 1])) i -= 1 j -= 1 elif this_backtrace == self.DOUBLE_DIAG: alignments.append((w1[i - 2:i], w2[j - 2:j])) i -= 2 j -= 2 elif this_backtrace == self.DOWN: # delete alignments.append((w1[i - 1], "%")) i -= 1 elif this_backtrace == self.LEFT: # insert alignments.append(("%", w2[j - 1])) j -= 1 alignments.reverse() if verbose: print(table) return table[M - 1, N - 1], alignments def transp_cost(self, char1, char2): ## how often do char1 and char2 get transposed? 
return 1 - self.transp_prob(char1, char2) def del_cost(self, char): return 1 - self.char_probs[char].prob('%') def ins_cost(self, char): return 1 - self.char_probs['%'].prob(char) def sub_cost(self, char1, char2): return 1 - self.char_probs[char1].prob(char2) def transp_prob(self, char1, char2): numerator = self.transp_freqs[char1] + .1 denominator = self.bichar_freqs[char1] + .1 * 26 * 26 return numerator / denominator def prob(self, w1, w2): score, alignment = self.align(w1, w2) total_prob = 0 for a, b in alignment: if len(a) > 1: total_prob += log(self.transp_prob(a[0], a[1])) else: total_prob += self.char_probs[a].logprob(b) return total_prob def show_alignment(self, alignments): print("String1:", " ".join([x[0] for x in alignments])) print("String2:", " ".join([x[1] for x in alignments]))
                  'THEREFORE', 'THEY', 'THEY\'D', 'THEY\'LL', 'THEY\'RE', 'THIRD',
                  'THIRTEEN', 'THIRTEENTH', 'THIRTIETH', 'THIRTY', 'THIS', 'THITHER',
                  'THOSE', 'THOUGH', 'THOUSAND', 'THOUSANDTH', 'THREE', 'THRICE',
                  'THROUGH', 'THUS', 'TILL', 'TO', 'TOWARDS', 'TODAY', 'TOMORROW',
                  'TOO', 'TWELFTH', 'TWELVE', 'TWENTIETH', 'TWENTY', 'TWICE', 'TWO',
                  'UNDER', 'UNDERNEATH', 'UNLESS', 'UNTIL', 'UP', 'US', 'VERY', 'WHEN',
                  'WAS', 'WASN\'T', 'WE', 'WE\'D', 'WE\'LL', 'WERE', 'WE\'RE',
                  'WEREN\'T', 'WE\'VE', 'WHAT', 'WHENCE', 'WHERE', 'WHEREAS', 'WHICH',
                  'WHILE', 'WHITHER', 'WHO', 'WHOM', 'WHOSE', 'WHY', 'WILL', 'WITH',
                  'WITHIN', 'WITHOUT', 'WON\'T', 'WOULD', 'WOULDN\'T', 'YES',
                  'YESTERDAY', 'YET', 'YOU', 'YOUR', 'YOU\'D', 'YOU\'LL', 'YOU\'RE',
                  'YOURS', 'YOURSELF', 'YOURSELVES', 'YOU\'VE']

files = ['IN', 'IP', 'LY', 'NA', 'OP', 'SP']
stop_words = set([word.lower() for word in function_words])

for file in files:
    with open(file + '.txt', 'w') as my_file:
        for each in glob('Mini-CORE/1+' + file + '*.txt'):
            # for each in glob('Mini-CORE/1+', files, '*.txt'):
            with open(each, 'r') as read_file:
                fd = FreqDist()
                text = read_file.read().lower()
                cleaned_text = clean(text)
                tokens = nltk.word_tokenize(cleaned_text)
                words = [token for token in tokens if token not in stop_words]
                tokens_fd = FreqDist(words)
                fd.update(tokens_fd)
                print(fd.most_common(), file=my_file)