Example #1
def create_logistic_model(df, _story_ids, data):
    labels = df["labels"].values  # the original relied on a `labels` name defined elsewhere
    results = []
    lr_models = {}
    for label in set(labels):
        positive_story_ids = set(df >> sift(X["labels"] == label) >> X.story_id.values)
        y_ = np.array([s in positive_story_ids for s in _story_ids])
        X_ = data
        lr = linear_model.LogisticRegression(C=C_VALUE)
        logging.info("%s %s", label, Counter(y_))

        # pre-0.18 sklearn API; newer versions use sklearn.model_selection.cross_val_score
        cv_score = cross_validation.cross_val_score(
            lr, X_, y_, cv=10, scoring="roc_auc").mean()

        lr = lr.fit(X_, y_)
        lr_models[label] = lr
        probs = lr.predict_proba(X_)[:, 1]
        results.append({"alg": "log reg", "label": label, "auc": cv_score})
        logging.info("C=%s label=%s auc=%s n_probs>0.19=%d %s",
                     C_VALUE, label, cv_score, len(probs[probs > 0.19]), Counter(labels == label))
    results_df = pd.DataFrame(results)

    lr_fname = make_time_filename(LOGISTIC_MODEL_NAME, ".pkl")
    logging.info("writing file", lr_fname)
    with open(lr_fname, "wb") as f:
        pickle.dump(lr_models, f, protocol=2)
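A minimal sketch (not part of the original) of how the pickled models written above might be loaded and reused; it assumes lr_fname still points at the file produced by make_time_filename:

import pickle

with open(lr_fname, "rb") as f:
    loaded_models = pickle.load(f)
# Each value is a fitted LogisticRegression keyed by label; new rows can be
# scored with loaded_models[label].predict_proba(new_X)[:, 1].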
Example #2
    def get_features(self, question, context):
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            question_words = set(x for x in question if x.lower() not in stop)
            question_words_lower = set(x.lower() for x in question)
            question_words_stem = set(
                self.lemmatize_word(x) for x in question_words_lower)
        else:
            question_words = set(k for k, v in Counter(question).items()
                                 if v == 1)
            question_words_lower = set(k for k, v in Counter(
                x.lower() for x in question_words).items() if v == 1)
            question_words_stem = set(k for k, v in Counter(
                self.lemmatize_word(x) for x in question_words_lower).items()
                                      if v == 1)

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            elif self._lemmatizer.lemmatize(word) in question_words_stem:
                context_features[i][2] = 1

        if self.empty_question_features:
            return np.zeros((len(question), 3)), context_features
        else:
            return np.zeros((len(question), 0)), context_features
Example #3
def llt(query):
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
    public_tweets = get_tweets(query)
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loc = tweet['user']['location']
        lang = tweet['user']['lang']
        tz = tweet['user']['time_zone']

        if loc in location:
            location[loc] += 1
        else:
            location[loc] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if tz in time_zone:
            time_zone[tz] += 1
        else:
            time_zone[tz] = 1
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]
    language_count = dict(Counter(language).most_common(4))
    print(colored("language: ", color='green', attrs=['bold']))
    print(language_count)
    location_count = dict(Counter(location).most_common(4))
    print(colored("locations: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(4))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
    print(
        colored(
            "*****************************************************************************************"
            "********************************************************************************************",
            color='magenta'))
Example #4
def llt(query):

    public_tweets = get_tweets(query)

    global time_zone1, loca, lang
    location = {}
    language = {}
    time_zone = {}
    for tweet in public_tweets['statuses']:
        loca = tweet['user']['location']
        lang = tweet['user']['lang']
        time_zone1 = tweet['user']['created_at']
        if loca in location:
            location[loca] += 1
        else:
            location[loca] = 1
        if lang in language:
            language[lang] += 1
        else:
            language[lang] = 1
        if time_zone1 in time_zone:
            time_zone[time_zone1] += 1
        else:
            time_zone[time_zone1] = 1

    # limiting the display of the values
    if None in time_zone:
        del time_zone[None]
    if '' in time_zone:
        del time_zone['']
    if '' in language:
        del language['']
    if '' in location:
        del location['']
    if None in location:
        del location[None]
    if None in language:
        del language[None]

    language_count = dict(Counter(language).most_common(5))
    print(colored("Language: ", color='green', attrs=['bold']))
    print(language_count)

    location_count = dict(Counter(location).most_common(5))
    print(colored("Location: ", color='green', attrs=['bold']))
    print(location_count)
    time_zone_count = dict(Counter(time_zone).most_common(5))
    print(colored("Time Zone: ", color='green', attrs=['bold']))
    print(time_zone_count)
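The manual if/else tallies in Examples #3 and #4 can be written directly with collections.Counter; a minimal self-contained sketch, using an invented statuses list purely for illustration:

from collections import Counter

statuses = [
    {"user": {"location": "Delhi", "lang": "en"}},
    {"user": {"location": "Delhi", "lang": "hi"}},
    {"user": {"location": "", "lang": "en"}},
]

location = Counter(t["user"]["location"] for t in statuses)
language = Counter(t["user"]["lang"] for t in statuses)

# Drop empty/None keys, then keep the most common entries.
for counter in (location, language):
    counter.pop("", None)
    counter.pop(None, None)

print(dict(location.most_common(5)))   # {'Delhi': 2}
print(dict(language.most_common(5)))   # {'en': 2, 'hi': 1}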
Example #5
def process_document(self, documents):
    tokenizer = data.load('tokenizers/punkt/english.pickle')
    lemmatizer = WordNetLemmatizer()
    stopwords = corpus.stopwords.words('english')
    tf = []  # term frequency per document
    idf = {}  # inverse document frequency per token
    tokens_list_doc_wise = []
    all_tokens = set()

    for document in documents:
        tokens = tokenizer.tokenize(document)
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if token not in stopwords]
        tokens_list_doc_wise.append(tokens)
        tf.append(Counter(tokens))
        all_tokens.update(tokens)  # union() returns a new set; update() modifies all_tokens in place

    # calculating idf
    for token in all_tokens:
        present_in_documents = 0
        for x in range(0, len(documents)):
            if tf[x][token] > 0:
                present_in_documents += 1
        idf[token] = math.log(len(documents) / present_in_documents)

    # calculating tf_idf for tokens document wise
    tf_idf = []
    for x in range(0, len(tokens_list_doc_wise)):
        doc_tf_idf = {}
        for token in tokens_list_doc_wise[x]:
            doc_tf_idf[token] = tf[x][token] * idf[token]
        tf_idf.append(doc_tf_idf)
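A compact, self-contained sketch of the same tf/idf bookkeeping with Counter, using invented toy documents; it illustrates the counting only and is not the original method:

import math
from collections import Counter

docs = [["cat", "sat", "mat"], ["cat", "ate", "fish"], ["dog", "sat"]]
tf = [Counter(doc) for doc in docs]

all_tokens = set()
for doc in docs:
    all_tokens.update(doc)

idf = {}
for token in all_tokens:
    df = sum(1 for counts in tf if counts[token] > 0)
    idf[token] = math.log(len(docs) / df)

# tf-idf per document: term count times inverse document frequency
tf_idf = [{token: counts[token] * idf[token] for token in counts} for counts in tf]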
Example #6
    def train(self, trainset):
        X = []
        y = []
        self.tfidf = TfidfVectorizer()
        for sent in trainset:
            self.tfidfWordList.append(sent['target_word'])
            for item in sent['target_word'].split(" "):
                self.wordList.append(item)
        #tf model
        self.tfList = Counter(self.wordList)
        tf_values = np.array(list(self.tfList.values()))  # avoid shadowing the built-in max
        self.maxNumber = np.max(tf_values)
        #tfidf model
        weightTfidf = self.tfidf.fit_transform(self.tfidfWordList).toarray()

        zeroVector = np.zeros(len(weightTfidf[0]))
        for item in weightTfidf:
            itemVector = np.array(item)
            zeroVector += itemVector
        # sklearn >= 1.0 renames get_feature_names() to get_feature_names_out()
        self.tfidfResult = dict(zip(self.tfidf.get_feature_names(),
                                    zeroVector))
        # self.tfidfResult = {key:value for key,value in self.tfidf.vocabulary_.items()}
        # self.normal = np.max(np.array([item for item in self.tfidfResult.values()]))
        # print(self.tfList)
        for sent in trainset:
            X.append(self.extract_features(sent['target_word']))
            y.append(sent['gold_label'])
        self.model.fit(X, y)
        title = "TF+TFIDF " + self.language.capitalize()
        self.plot_learning_curve(self.model, title, X, y)
Example #7
def modified_precision(candidate, references, n):
    candidate_counter = Counter(get_ngrams(candidate, n))
    if not candidate_counter:
        return 0

    max_reference_counter = {}
    for reference in references:
        reference_counter = Counter(get_ngrams(reference, n))
        for ngram in candidate_counter:
            max_reference_counter[ngram] = max(
                max_reference_counter.get(ngram, 0), reference_counter[ngram])

    # clip each candidate n-gram count by the best count seen in any reference
    clipped_counter = dict(
        (ngram, min(candidate_count, max_reference_counter[ngram]))
        for ngram, candidate_count in candidate_counter.items())
    return sum(clipped_counter.values()) / sum(candidate_counter.values())
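A hedged usage sketch for the function above; get_ngrams is not shown in the snippet, so a plausible stand-in is defined here as an assumption:

def get_ngrams(tokens, n):
    # assumed helper: yields tuples of n consecutive tokens
    return zip(*(tokens[i:] for i in range(n)))

candidate = "the the the the the the the".split()
references = ["the cat is on the mat".split(),
              "there is a cat on the mat".split()]
print(modified_precision(candidate, references, 1))  # 2/7: clipped count 2 over 7 candidate unigrams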
Example #8
def td_idf(text_array, neg_word_occ, pos_word_occ):
    for i in text_array:
        word_map = Counter(i)
        for wd in word_map:
            word_map[wd] = log(len(text_array) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(i)
        print(word_map)
        break
Example #9
def countWords(self):
    filteredWords = cleanWords(self.contents)
    counts = Counter(filteredWords)
    wordFreq = dict()
    # Convert to relative frequency
    for c in counts:
        wordFreq[c] = counts[c] / len(filteredWords)
    return wordFreq
Example #10
def td_idf_to_vec(dataset, dim, neg_word_occ, pos_word_occ):
    result = zeros((len(dataset), dim))
    for i, val in enumerate(dataset):
        word_map = Counter(val)
        for wd in val:
            print(word_map)
            result[i][wd] = log(len(dataset) / (neg_word_occ[wd] + pos_word_occ[wd]), 10) * word_map[wd] / len(val)
    return result
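Note that result[i][wd] above indexes the matrix with the word itself; a common variant (an assumption, not the original author's code) maps each word to a fixed column first:

from math import log
from collections import Counter
from numpy import zeros

def td_idf_to_vec_indexed(dataset, vocab, neg_word_occ, pos_word_occ):
    # vocab: dict mapping each word to a column index, e.g. {"good": 0, "bad": 1, ...}
    result = zeros((len(dataset), len(vocab)))
    for i, val in enumerate(dataset):
        word_map = Counter(val)
        for wd in val:
            if wd in vocab:
                result[i][vocab[wd]] = (log(len(dataset) / (neg_word_occ[wd] + pos_word_occ[wd]), 10)
                                        * word_map[wd] / len(val))
    return result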
Example #11
def ngrams_over_file(sourcefile, n, targetfile):
    f = open(targetfile, 'w')
    data_set = read_file(sourcefile)
    split_it = data_set.split()
    split_it = list(filter(lambda x: x not in stopwords, split_it))
    split_it = list(filter(lambda x: len(x) > 2, split_it))
    counter = Counter(split_it)
    if n > 1:
        grams = ngrams(split_it, n)
        counter = Counter(grams)
    most_occur = counter.most_common(20)
    for occur in most_occur:
        if n == 1:
            f.write(occur[0] + "|" + str(occur[1]) + "\n")
        else:
            f.write(' '.join(map(str, occur[0])) + "|" + str(occur[1]) + "\n")
    print(most_occur)
    f.close()
Example #12
 def getsimiliar(self, word, text):
     T = Text(text)
     word_context_index = ContextIndex(T.tokens,
                                       filter=lambda x: x.isalpha(),
                                       key=lambda s: s.lower())
     word = word.lower()
     wci = word_context_index._word_to_contexts
     words = []
     if word in wci.conditions():
         contexts = set(wci[word])
         fd = Counter(w for w in wci.conditions() for c in wci[w]
                      if c in contexts and not w == word)
         words = [w for w, _ in fd.most_common(20)]
     return words
Example #13
def cleaning(sentence: str,
             method: str,
             frequency: int = 0,
             percentage: int = 0):
    """
    :param sentence: Definition to clean
    :param method: string which define which method to call
    :param frequency: if not None define minimum number of words repetition
    :param percentage: percentage of the highest frequent words to take
    :return Counter(key=word,value=frequency): sentence cleaned
    """
    tokenized: Counter = rm_stopwords_punctuation(sentence)
    tokenized = utility.remove_number_key(tokenized,
                                          minimum=1950,
                                          maximum=2030)
    if len(tokenized) <= 0:
        return Counter()
    elif frequency > 0:
        # Filtering only words with at least frequency occurrences
        filtered = dict(filter(lambda x: x[1] >= frequency, tokenized.items()))
        i = 1
        while len(filtered) <= 0:
            filtered = dict(
                filter(lambda x: x[1] >= frequency - i, tokenized.items()))
            i += 1
        return globals()[method](Counter(filtered))
    # If a percentage is defined take the first elements (based on percentage), otherwise take everything
    elif percentage > 0:
        percentage = int((percentage / 100) * len(tokenized))
        most_common = tokenized.most_common(percentage)
        tokenized = Counter(
            dict(
                filter(lambda elem: elem[0] in dict(most_common).keys(),
                       tokenized.items())))

    return globals()[method](tokenized)
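The percentage branch above keeps only the top share of the Counter; a standalone sketch of just that filtering step, with an invented token Counter:

from collections import Counter

tokenized = Counter({"network": 5, "neural": 4, "model": 2, "the": 1})
percentage = 50  # keep the top 50% most frequent words
keep = int((percentage / 100) * len(tokenized))
top = dict(tokenized.most_common(keep))
filtered = Counter({w: c for w, c in tokenized.items() if w in top})
print(filtered)  # Counter({'network': 5, 'neural': 4})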
Example #14
def rm_stopwords_punctuation(sentence: str,
                             language="english",
                             stamp=False) -> Counter:
    tokens = word_tokenize(sentence)
    if len(tokens) > 0:
        tokens[0] = tokens[0].lower()
    sentence = Counter(tokens)
    stopwords_list = set(stopwords.words(language))
    stop_punctuation = stopwords_list.union(resources.punctuation).union(
        resources.ambiguous)
    filtered = utility.filter_by_set(sentence, stop_punctuation)
    if stamp:
        print("---Removing Stopwords---")
        print("Stopwords in", language, ":", stopwords_list)
        print("Sentence with stopwords and punctuation removed:\n", filtered)
    return filtered
Example #15
def Topusage():
    new_tweets = api.user_timeline(screen_name='@narendramodi',
                                   count=200,
                                   tweet_mode='extended')
    for tweet in new_tweets:
        #print(tweet.full_text)
        temp = []
        temp.append(tweet.full_text)
        temp1 = temp
        import re
        words = re.sub(r"http\S+", " ", str(temp1))
        word = words.split()
        word1 = [w for w in word if w not in stop_words]  # keep only non-stopwords
        num = Counter(word1).most_common(10)
        print(num)
Example #16
namedEnt.draw()


#6 Bag of Words model - most common words (DataCamp)

import nltk
from nltk import Counter
from nltk.corpus import stopwords
# so far we already have text, lets do a quick bow model:


alpha_tokens_lower= [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
no_stops=[w for w in alpha_tokens_lower if w not in stopwords.words('english') ]


len(alpha_tokens_lower)-len(no_stops)
words_count=Counter(no_stops)

words_count.most_common(17)

#7 Bag of words model Udemy:

#we already have the text:
import re
dataset=nltk.sent_tokenize(text)


#lowercase, replace non-alpha characters with a single space
for i in range(len(dataset)):
    dataset[i]=dataset[i].lower()
    dataset[i]=re.sub(r'\W', ' ', dataset[i])
    dataset[i]=re.sub(r'\s+',' ',dataset[i])  
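One possible way (an assumption, not part of the original snippet) to finish the bag-of-words step the comments describe, counting words per cleaned sentence with Counter:

from collections import Counter

sentence_counts = [Counter(sentence.split()) for sentence in dataset]
corpus_counts = sum(sentence_counts, Counter())  # one corpus-wide Counter
print(corpus_counts.most_common(10))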
Example #17
 def count_words(cls, items: list):
     counter = Counter(items)
     return counter.most_common(5)
Example #18
 def _token_counter(self):
     """Return the counts of all tokens"""
     return Counter([word for doc in self.tokens for word in doc])
Example #19
def lemmer(tokens) -> Counter:
    lemmed = Counter()
    for k in tokens.keys():
        lemmed.update({lemmatizer.lemmatize(k): tokens[k]})
    return lemmed
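A minimal usage sketch for lemmer, assuming lemmatizer is an NLTK WordNetLemmatizer and the WordNet data is available:

from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tokens = Counter({"dogs": 3, "dog": 1, "running": 2})
print(lemmer(tokens))  # Counter({'dog': 4, 'running': 2})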
Example #20
                    semantic_values = stringate_value(hypernom_subj, hypernom_dobj)
                    semantic_type.append(semantic_values)

    return semantic_type, sentences_analyzed


if __name__ == '__main__':
    verbs_bf = ['build', 'love', 'eat']

    for verb_base_form in verbs_bf:

        sentences = get_sentences_with_verb(verb_base_form)
        print('*' * 50)
        print('\nCurrent verb base form : {}\n'.format(verb_base_form))
        semantic_cluster, sentences_analyzed = get_semantic_cluster(sentences, verb_base_form)
        print('------ End extraction-----------')

        # Print stats
        sts_semantic_cluster = Counter(semantic_cluster)
        common_semantic_cluster = sts_semantic_cluster.most_common(5)
        plot_result(common_semantic_cluster, verb_base_form)

        print('\nAnalyzed {} sentences \nFor the verb in base form : {} the pairs of semantic types are:\n'
              .format(sentences_analyzed, verb_base_form))

        for s in sts_semantic_cluster:
            print('\t< {} > Count {} '.format(s, sts_semantic_cluster[s]))

        print('*' * 50)
        print('\n\n\n')
Example #21
 def statistics(self, trainset):
     for sent in trainset:
         self.wordNumber += len(
             re.sub(r"[^\w']", " ", sent['sentence']).split())
         self.wordbackup += re.sub(r"[^\w']", " ", sent['sentence']).split()
     self.wordCounter = Counter(self.wordbackup)
Example #22
def _modified_precision(candidate, references, n):
    """Calculate modified ngram precision.

    The normal precision method may lead to some wrong translations with
    high-precision, e.g., the translation, in which a word of reference
    repeats several times, has very high precision. So in the modified
    n-gram precision, a reference word will be considered exhausted after
    a matching candidate word is identified.

    Paper examples:

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=1,
    ... )
    0.28...

    >>> _modified_precision(
    ...    'the the the the the the the'.split(),
    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...    n=2,
    ... )
    0.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=1,
    ... )
    1.0

    >>> _modified_precision(
    ...    'of the'.split(),
    ...    [
    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...    ],
    ...    n=2,
    ... )
    1.0

    More examples:

    >>> weights = [0.25, 0.25, 0.25, 0.25]
    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    Unigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.94...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=1,
    ... )
    0.57...

    Bigrams:

    >>> _modified_precision(
    ...    candidate1,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.58...

    >>> _modified_precision(
    ...    candidate2,
    ...    [reference1, reference2, reference3],
    ...    n=2,
    ... )
    0.07...

    """
    counts = Counter(ngrams(candidate, n))

    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0),
                                    reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram]))
                          for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values())
Example #23
import pandas
import vincent
from nltk import Counter

from DB import db
from Preprocessing import preprocess, stop

count = 0
db = db()

allTweets = db.getAll()

count_all_hashtags = Counter()
count_all_terms = Counter()
dates_hashtag = []
for tweet in allTweets:
    tweetText = tweet['text'].lower()
    # Bigrams list
    termsWithoutStopwords = [
        term for term in preprocess(tweetText) if term not in stop
    ]
    # termsBigrams = bigrams(termsWithoutStopwords)

    # Hashtags list
    terms_hash = [
        term for term in preprocess(tweetText) if term.startswith('#')
    ]
    if '#marchfortruth' in terms_hash:
        dates_hashtag.append(tweet['created_at'])

    # Update the counter(s)
Example #24
def stemmer(tokens) -> Counter:
    stemmed = Counter()
    for k in tokens.keys():
        stemmed.update({stemmatizer.stem(k): tokens[k]})
    return stemmed
Example #25
def get_features(text, setting):
    if setting == 'bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if word not in stoplist}
    else:
        return {word: True for word in preprocess(text) if word not in stoplist}
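A hedged usage sketch for get_features; preprocess and stoplist are not defined in the snippet, so simple stand-ins are assumed here:

from collections import Counter

stoplist = {"the", "a", "is"}

def preprocess(text):
    # assumed stand-in: lowercase whitespace tokenization
    return text.lower().split()

text = "the cat sat on the mat"
print(get_features(text, "bow"))    # {'cat': 1, 'sat': 1, 'on': 1, 'mat': 1}
print(get_features(text, "bool"))   # {'cat': True, 'sat': True, 'on': True, 'mat': True}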
Example #26
#count words in data_split: if the word is already in the wordcount dictionary,
#increment its count, otherwise add it with a count of 1
for item in data_split:
    if item in wordcount.keys():
        wordcount[item] += 1
    else:
        wordcount[item] = 1

qstring = "I think I will get the best score in the class"
qstring_split = qstring.split()
qstring_dict = {}
for word in qstring_split:
    if word in qstring_dict.keys():
        qstring_dict[word] += 1
    else:
        qstring_dict[word] = 1
#count bigrams in the text file
from nltk import Counter
data_bi = Counter(nltk.bigrams(data_split))
q_bi = Counter(nltk.bigrams(qstring_split))
#compute the probability of each query bigram from the data bigram counts
biprob_list = []
for item in q_bi:
    if item in data_bi:
        biprob_list.append(q_bi[item] / data_bi[item])
    else:
        biprob_list.append(0)  # unseen bigram: contributes zero probability
total_prob = 1
for prob in biprob_list:
    total_prob = total_prob * prob
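A self-contained sketch of the same bigram-probability idea with invented sentences; unseen query bigrams are assumed to make the whole probability zero, which matches the fix above:

import nltk
from collections import Counter

data_split = "I think I will do well in the class I will do great".split()
qstring_split = "I will do well in the class".split()

data_bi = Counter(nltk.bigrams(data_split))
q_bi = Counter(nltk.bigrams(qstring_split))

total_prob = 1.0
for bigram in q_bi:
    if bigram in data_bi:
        total_prob *= q_bi[bigram] / data_bi[bigram]
    else:
        total_prob = 0.0  # unseen bigram
        break
print(total_prob)  # 0.25 for this toy data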