Example #1
def process(f, return_tokens=True, return_freqdist=True):
    """
    Function to process deals data.
    Splits text into sentences. FreqDist is incremented from tokenization.
    Using PunktWordTokenizer, since it is a decent regexp-based tokenizer.
    Deals are also about domain names. Not intending to split it up

    :rtype : FreqDist, list() of str
    :param f: Input file with a deal per line
    """
    fd = FreqDist()
    tokens = []
    fh = open(f, 'r')
    sentences = [line.strip() for line in fh.readlines()]
    for line in sentences:
        t = []
        for word in PunktWordTokenizer().tokenize(line.lower()):
            if word not in set(stopwords.words('english')) and word not in set(string.punctuation):
                if return_tokens:
                    t.append(word)
                if return_freqdist:
                    fd.inc(word)
        tokens.append(t)
    fh.close()
    return fd, sentences, tokens
def get_word_features(wordlist):

    wordlist = FreqDist(wordlist)

    word_features = wordlist.keys()

    return word_features
    def __init__(
        self,
        unk_cutoff,
        jm_lambda=0.6,
        dirichlet_alpha=0.1,
        katz_cutoff=5,
        kn_discount=0.1,
        kn_concentration=1.0,
        tokenize_function=TreebankWordTokenizer().tokenize,
        normalize_function=lower,
    ):
        self._unk_cutoff = unk_cutoff
        self._jm_lambda = jm_lambda
        self._dirichlet_alpha = dirichlet_alpha
        self._katz_cutoff = katz_cutoff
        self._kn_concentration = kn_concentration
        self._kn_discount = kn_discount
        self._vocab_final = False

        self._tokenizer = tokenize_function
        self._normalizer = normalize_function

        # Add your code here!
        self._vocab_freq = FreqDist()
        self._gram_freq = FreqDist()
        self._context_freq = FreqDist()

        self._vocab_freq[kSTART] += kUNK_CUTOFF + 1
        self._vocab_freq[kEND] += kUNK_CUTOFF + 1
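The fd.inc(word) call in process() above follows the older NLTK 2 API; on NLTK 3, FreqDist is a collections.Counter subclass, so counting is done with item assignment or update(). A minimal sketch of the same counting pattern (the token list is made up for illustration):

from nltk import FreqDist

tokens = ["free", "shipping", "on", "all", "deals", "free", "returns"]

fd = FreqDist()
for token in tokens:
    fd[token] += 1              # replaces the removed fd.inc(token)

# FreqDist also accepts an iterable directly and supports Counter-style updates
fd.update(["free", "deals"])
print(fd.most_common(3))        # [('free', 3), ('deals', 2), ...]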
Example #4
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg) 

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)
    
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics
Example #5
 def top_words_from_corpus(self, num_words, test_name):
     corpus_tokens = []
     for i in self.corpus_vars["corpus_member_ids"]:
         title = 'document_' + str(i)
         doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
         corpus_tokens += doc_tokens
     top_words = []
     fdist_corpus = FreqDist(corpus_tokens)
     fdist_list = fdist_corpus.items()
     if test_name == "Function Word PCA":
         function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 tagged_top = nltk.pos_tag(top_words)
                 for j,k in tagged_top:
                     if k not in function_pos:
                         top_words.remove(j)
                 if len(top_words) == num_words:
                     break
     elif test_name == "Burrows's Delta":
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 break
     return top_words
Example #6
    def generate_ngrams_profile(self, text, profile_size, min_size=2, max_size=3):
        """
        It reads incoming text, generates all possible N-grams, with sizes ranging between min_size and max_size and counts the occurrences of all N-grams.

        Parameters
        ----------
        text : unicode

        profile_size : int

        min_size : int, optional (default=2)

        max_size : int, optional (default=3)

        Returns
        -------
        ngram_profile : FreqDist object

        """
        raw_ngrams = []
        text = self.sanitize_text(text)
        for n in range(min_size, max_size+1):
            for ngram in ngrams(text, n):
                raw_ngrams.append(''.join(unicode(i) for i in ngram))
        fdist = FreqDist(raw_ngrams)
        ngram_profile = fdist.most_common(n=profile_size)
        return ngram_profile
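A self-contained sketch of the same idea, building a character N-gram profile with nltk.ngrams and FreqDist (the sample string and profile size are made up):

from nltk import FreqDist
from nltk.util import ngrams

def ngram_profile(text, profile_size=10, min_size=2, max_size=3):
    """Return the profile_size most common character N-grams of text."""
    raw_ngrams = []
    for n in range(min_size, max_size + 1):
        for gram in ngrams(text, n):          # n-grams over the characters of the string
            raw_ngrams.append(''.join(gram))
    return FreqDist(raw_ngrams).most_common(profile_size)

print(ngram_profile("natural language processing", profile_size=5))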
Example #7
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in the set of tweets; bottomCutOff and
    topCutOff are both integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []
    
    for item in fd.items()[:topCutOff]:
        # append most common words
        thrownOut.append(item)

    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # append good words
            newVocab.append(item[0])
        else:
            # append uncommon words
            thrownOut.append(item)

    print 'Cutoffs made...'
    return newVocab, thrownOut
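On NLTK 3, fd.items() is no longer sorted by frequency, so slicing it as above no longer works; a sketch of the same cut-off logic built on most_common() (the cut-off values below are made up):

from nltk import FreqDist

def make_cutoff(flat_list, bottom_cutoff, top_cutoff):
    """Drop the top_cutoff most common tokens and any token seen bottom_cutoff times or fewer."""
    ranked = FreqDist(flat_list).most_common()       # (token, count) pairs, most frequent first
    thrown_out = list(ranked[:top_cutoff])
    new_vocab = [tok for tok, count in ranked[top_cutoff:] if count > bottom_cutoff]
    thrown_out += [(tok, count) for tok, count in ranked[top_cutoff:] if count <= bottom_cutoff]
    return new_vocab, thrown_out

vocab, discarded = make_cutoff(["a", "a", "a", "b", "b", "c"], bottom_cutoff=1, top_cutoff=1)
print(vocab, discarded)    # ['b'] [('a', 3), ('c', 1)]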
def main():
    userInput = parser.getInput()
    fileList = parser.getFiles(userInput['train'])
    pdata = parser.parseFiles(fileList)

    allsent = ''
    for f in pdata:
        allsent += f[3]

    all_words = FreqDist(w.lower()
                    for w in word_tokenize(allsent)
                        if w not in stopwords.words('english') )

    global top_words
    top_words = all_words.keys()[:500]

    # pdata = getParseData()
    featdata = featureAggregator(pdata)

    print featdata[:10]
def get_hosts(year):
    '''Hosts is a list of one or more strings. Do NOT change the name
    of this function or what it returns.'''
    # Your code here
    file_name = 'gg%s.json' % year
    with open(file_name, 'r') as data:
        db = json.load(data)
    hosts = []
    pairs = []
    for f in db:
        e = f['text']
        if 'and' in e.lower():
            for proper in strip_proper_pairs(normalize_str(e).split()):
                pair = proper.split('and')
                if len(pair) == 2:
                    if pair[0] != ' ' and pair[1] != ' ':
                        pairs.append((pair[0].lower().replace('\'','\"').strip(' '), pair[1].lower().replace('\'','\"').strip(' ')))
    pairs_freq = FreqDist(pairs)
    if len(pairs_freq.most_common(10)[0][0][0].split(' ')) < 2:
        hosts.append(pairs_freq.most_common(10)[1][0][0])
        hosts.append(pairs_freq.most_common(10)[1][0][1])
    else:
        hosts.append(pairs_freq.most_common(10)[0][0][0])
        hosts.append(pairs_freq.most_common(10)[0][0][1])
    return hosts
def BootstrapFD(samp):
    fd = FreqDist(samp)
    f1 = float(fd.Nr(1))
    f2 = float(fd.Nr(2))
    N = float(fd.N())
    B = fd.B()
    # Undetected species & Coverage
    if f2 > 0.0:
        f0 = ceil(((N - 1.0) / N) * (f1 ** 2.0) / (2.0 * f2))
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0 * f2)
    else:
        f0 = ceil(((N - 1.0) / N) * f1 * (f1 - 1.0) / 2.0)
        C = 1.0 - f1 / N * (N - 1.0) * f1 / ((N - 1.0) * f1 + 2.0)
        # Correct abundances
    probs = array(fd.values()) / N
    lambdah = (1 - C) / sum(probs * (1 - probs) ** N)
    probs = probs * (1 - lambdah * (1 - probs) ** N)
    # P for unseen
    # paux = (1-C)/f0
    yield fd.values()
    popO = arange(B)
    dist = binom(n=N, p=1 - C)
    probsA = probs / sum(probs)
    while True:
        ns2 = dist.rvs()
        ns1 = int(N) - ns2
        if ns1 > 0:
            samp1 = list(choice(popO, size=ns1, replace=True, p=probsA))
        else:
            samp1 = []
        if ns2 > 0:
            samp2 = list(random_integers(B, B + int(f0) - 1, ns2))
        else:
            samp2 = []
        yield FreqDist(samp1 + samp2).values()
def setUpOwnSubjectStopWords():
    for topic in topics_table_noun_only_title:
        #only limiting it to a specified length

        #might want to look into the numeric part
        all_description = [ds for ds in topics_table_noun_only_description[topic] if len(ds) > 5]
        all_topics = [t for t in topics_table_noun_only_title[topic] if len(t) > 5]


        fdist_description = FreqDist(all_description)
        fdist_topics = FreqDist(all_topics)

        ten_most_common_descr = fdist_description.most_common(10)
        ten_most_common_topic = fdist_topics.most_common(10)
        built_topic_stop_words[topic] = [word for word, freq in ten_most_common_descr]
        built_topic_stop_words[topic].extend([word for word, freq in ten_most_common_topic])

        # Here we take the top 5-10 words. We need to look into the data more to find
        # a good cut-off value, but for simplicity's sake we pick a fixed number for now;
        # let's see how accuracy changes as we vary the number of most frequent words.


    for topic in built_topic_stop_words:
        print built_topic_stop_words[topic]
        print "\n"
Example #12
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
Example #13
def main():
    keyword_list = ["Top Secret", "Secret Service", "Classified", "Targeted", "Assassination",
                    "Kill Program", "NSA", "wire", "CIA", "FBI", "DEA", "DOJ", "hackers",
                    "hacker", "exploit code", "Defense", "Intelligence", "Agency"]
    file_name = "tweets_output.txt"
    pickle_words_file = "words.pickle"
    pickle_words(file_name, pickle_words_file, keyword_list)
    pickle_tweets_file = "tweets.pickle"
    pickle_tweets(file_name, pickle_tweets_file)
    words = load(open("words.pickle"))
    tweets = load(open("tweets.pickle"))
    freq_dist = FreqDist(words)
    print tweets
    print("===")
    print("Conducting Frequency and Lexical Diversity Analysis of Twitter Search Space: ")
    print("===")
    print("Number of words within the twitter search space: ")
    print(len(words))
    print("Number of unique words within twitter search space: ")
    print(len(set(words)))
    print("Lexical Diversity of unique words within twitter search space: ")
    print(lexical_diversity(words))
    print("===")
    print("Conducting Native Language Processing Analysis Utilizing Python NLTK")
    print("===")
    print("Top 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[:50])
    print("===")
    print("Bottom 50 Frequent Words within the Twitter Search Space: ")
    print(freq_dist.keys()[-50:])
    print("===")
Example #14
 def find_names(self):
     """creates a frequency distribution of the
     most common names in the texts"""
     names_list = LIST_OF_NAMES
     name_tokens = [w for w in self.tokens if w in names_list]
     fd = FreqDist(name_tokens)
     return fd.most_common(50)
Example #15
class Index:
    """
    The Index class stores an index for a document.
    """
    def __init__(self):
        self._freq_dist = None
        self._document = None

    def index(self, document):
        self._document = document
        if self._freq_dist == None:
            self._freq_dist = FreqDist()
            for term in self.terms():
                self._freq_dist.inc(term)

    def reset(self):
        "Reset the index"
        self._freq_dist = None

    def freq_dist(self):
        if self._freq_dist is None:
            self.index(self._document)
        return self._freq_dist

    # return the number of times a term appears in this document
    def freq(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return self._freq_dist[term]

    def tf(self, term):
        if not self._freq_dist:
            self.index(self._document)
        return float(self._freq_dist[term]) / float(self._freq_dist.N())
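The class above relies on an unshown terms() method and the older fd.inc() API. A self-contained stand-in for the same term-frequency idea on the current FreqDist API might look like this (class name and whitespace tokenization are illustrative assumptions):

from nltk import FreqDist

class DocumentIndex:
    """Minimal stand-in for Index, for a single whitespace-tokenized document."""
    def __init__(self, document):
        self._freq_dist = FreqDist(document.split())

    def freq(self, term):
        # raw count of the term in the document
        return self._freq_dist[term]

    def tf(self, term):
        # relative term frequency
        return self._freq_dist[term] / self._freq_dist.N()

idx = DocumentIndex("to be or not to be")
print(idx.freq("to"))   # 2
print(idx.tf("be"))     # 0.333...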
Example #16
    def palavrasChaves(self):
        # NLTK call that returns the English stopword list
        stopE = stopwords.words('english')

        # NLTK call that returns the Portuguese stopword list
        stop = stopwords.words('portuguese')

        stopS = stopwords.words('spanish')
        
        palavrasChaves = [] 
        textoArtigo = []
        
        # strip punctuation from the text and split it into words
        for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
            # drop Portuguese stopwords from the article text being processed
            if i not in stop:
                # drop English stopwords from the article text being processed
                if i not in stopE:
                    # drop Spanish stopwords as well
                    if i not in stopS:
                            # ignore words shorter than 3 characters, e.g. the Portuguese verb "é"
                            if len(i) > 2:
                                textoArtigo.append(i)
        
        # frequency of word repetitions in the body of the article
        freq = FreqDist(textoArtigo)

        # take the four most frequent words
        items = freq.items()[:4]

        # put the most frequent words of the text into palavrasChaves
        for i in range(0,len(items)):
            palavrasChaves.append(items[i][0])
            
        return palavrasChaves        
Example #17
  def __init__(self, num_topics, alpha_topic = 1.0, alpha_word = 1.0, 
               max_tables = 50000, sanity_check=False, initialize=False,
               report_filename="topic_history.txt"):

    self.max_tables = max_tables
    self._alphabet = FreqDist()
    # store all words seen in a list so they are associated with a unique ID.

    self.initialize_index()

    self._words = FreqDist()

    self.alpha_topic = alpha_topic
    self.alpha_word = alpha_word

    self._num_updates = 0
    self._report = None
    if report_filename:
        self._report = open(report_filename, 'w')

    self.num_topics = num_topics
    self._topics = [FreqDist() for x in xrange(num_topics)]

    # the sanity_check flag is for testing only. 
    if initialize and sanity_check == True:
        self.deterministic_seed()
    elif initialize:
        self.initialize_topics()
Example #18
def get_top_followings(screen_name):
    # authorize twitter, initialize tweepy

    api = TwitterGrabber.initialise_api(0)

    print(api.get_status)

    # initialize a list to hold all the tweepy Tweets
    all_tweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # get the user object
    # user = api.get_user(screen_name=screen_name)
    # print(user.lists_subscriptions)

    # save most recent tweets
    all_tweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        # print("getting tweets before %s" % oldest)

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        all_tweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(all_tweets)))

    tweet_text = []

    for tweet in all_tweets:
        tweet_text.append(tweet.text)

    content = []
    retweets = []

    for tweet in tweet_text:
        words = word_tokenize(tweet, 'english')
        content.extend(strip_words(words))

        if words[0] == 'RT':
            retweets.append(words[2])

    tweet_distribution = FreqDist(retweets)

    print(tweet_distribution.most_common(20))

    a = follow_description(api, tweet_distribution.most_common(20), screen_name)

    return a
Example #19
def cleaner(filename):
	textfile = open(os.path.join(app.config['UPLOAD_FOLDER'], filename),'r')
	text = []
	all_dates = []
	complete_text = []
	words_list = []
	nodes = []
	for line in textfile:
		datetime,chat = line.split('-')
		date, time = datetime.split(',')
		loc = chat.find(':')

		#if len(chat.split(':'))==3:
		#	print chat
		user,text = chat[:loc],chat[loc+2:]
		text = text.replace("\n",'')
		words = text.split(' ')
		for i in words:
			words_list.append(i)
		complete_text.append(text)
		nodes.append(user)
		all_dates.append(date)

	#print set(nodes)
	#print set(all_dates)
	fdist = FreqDist(words_list)
	f1 = fdist.most_common(100)
	create_csv('wordcloud.csv',f1)
	textfile.close()
def train_wordfrequency(n_dims = 50):
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    train_id = dataloader.id
    _, pos_id, neg_id = dataloader.balance(train_id, 'full')
    train_data_pos = dataloader.data_retrieve(pos_id)
    train_data_neg = dataloader.data_retrieve(neg_id)
    tokens = sum(dataloader.data.viewvalues(), [])
    tokens_pos = sum(train_data_pos['data'].viewvalues(), [])
    tokens_neg = sum(train_data_neg['data'].viewvalues(), [])

    fdist_base = FreqDist(tokens)

    fdist_pos = FreqDist(tokens_pos)
    fdist_pos = normalize(fdist_pos, fdist_base)
    fdist_neg = FreqDist(tokens_neg)
    fdist_neg = normalize(fdist_neg, fdist_base)

    print list(fdist_pos.viewkeys())[:100]
    print list(fdist_neg.viewkeys())[:100]

    labels_pos = [1] * len(tokens_pos)
    labels_neg = [0] * len(tokens_neg)

    labels = labels_pos + labels_neg
    corpus = tokens_pos + tokens_neg
Example #21
def follow_description(api, friend_list, screen_name):
    the_list = []
    all_tags = []

    for friend in friend_list:
        username = friend[0]
        frequency = friend[1]

        print(username)

        try:
            user = api.get_user(screen_name=username)
            for list_obj in user.lists_memberships(screen_name=username, count=50):
                for w in list_obj.name.lower().split(" "):
                    # print(w)
                    all_tags.append(w)

        except TweepError as err:
            print(err.reason)
            break

    # print(all_tags)
    the_list_name = strip_words(all_tags)
    the_list_dist = FreqDist(the_list_name)

    # for w in the_list_dist:
    #     print ('***' + str(w))

    print(the_list_dist.most_common(20))
    return the_list_dist.most_common(20)
Example #22
def analyzeTitles():
    fulltitles = []
    titles = []
    with open('../top100clean.csv', 'rb') as bookfile:
        reader = csv.reader(bookfile)
        for row in reader:
            if "..." in row[0]:
                row[0] = " ".join(row[0].split(" ")[:-1])
            words = nltk.word_tokenize(row[0])
            for w in words:
                if w.isalpha() and w.lower() not in ['the','a']:
                    titles.append(w.lower())
            fulltitles.append(row[0])

    titleset = nltk.Text(titles)
    wordsintitle = [len(f.split(" ")) for f in fulltitles]
    wit_fd = FreqDist(wordsintitle)
    print "\nw.i.t.\tfreq"
    print "--------------------"
    for numwords, times in wit_fd.iteritems():
        print str(numwords) + "\t" + str(times)
    print "\n"

    print "\nword\t\tfreq"
    print "--------------------"
    fd = FreqDist(titleset)
    common_words = fd.most_common(25)
    for k, v in common_words:
        print str(k) + "\t\t" + str(v)
Example #23
   def top(self, tokens, lowest_rank=50):
      ''' A list of the most frequent (non-stopword) tokens '''
      from operator import itemgetter
      content = self.words(tokens)

      fdist = FreqDist(content)
      vocab = iter(fdist.keys())

      # Forget all previous ranking
      self.lower_words = {}
      frequency = 0
      while frequency < lowest_rank:
         try:
            word = vocab.next()
         except StopIteration:
            break

         word_lower = word.lower()
         if word_lower in self.lower_words:
            self.lower_words[word_lower] = self.lower_words[word_lower] + fdist[word]
         else:
            self.lower_words[word_lower] = fdist[word]

         frequency = frequency + 1

#      return sorted(self.lower_words, key=itemgetter(1), reverse=True)
      return map(itemgetter(0), sorted(self.lower_words.items(), key=itemgetter(1), reverse=True))
Example #24
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
                    
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Example #25
def create_word_freq(db):
  db = getattr(db, "Posts")
  #client.command("CREATE CLASS concepted EXTENDS E")

  client.command("DELETE EDGE concepted")
  #client.command('create property frequency.freq string')

  #client.command("DELETE VERTEX frequency")
  data =  db.find().batch_size(50)
  concept = client.command("SELECT name FROM concept")
  c = [c.name for c in concept]
  for d in data:
    if not 'Body' in d:
        display= ''
    else:
        display= cleanhtml(d['Body'].replace('\n', ' ').replace('\r', '').replace('\\', ''))
        tokens = nltk.word_tokenize(display)
        fdist=FreqDist(tokens)
        i  = fdist.most_common()
        for k in i:
          if k[0].lower() in c:
            try:
                client.command("CREATE EDGE concepted FROM (SELECT FROM concept WHERE name = '{0}') TO (SELECT FROM Content WHERE PostId = {1}) SET strength = {2}".format(k[0].lower(),d['_id'],k[1]))
            except:
              continue
def process_tweets (hashtag,addl_stops=[]):
    count=0
    good_count=0
    words_to_plot=[]
    #Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname,'r') as data_file:
                data=data_file.read()
                # Parse raw string since json.load() approach wasn't working
                data=data.split("\n\x00,")
            for tweet in data:
                count+=1
        
                # Tweets have a well-defined structure, so we can parse them 
                # manually (even though the JSON approach would be cleaner)
                text=tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                
                # Skip tweets that contain Unicode
                if text.find('\u')>=0:
                    continue
                else:
                    good_count+=1
                    # Tokenize and count word frequency, ignoring case
                    words = word_tokenize(text)
                    clean_words= [w.lower() for w in words if not w.lower() in set(stops+addl_stops)]
                    words_to_plot=words_to_plot+clean_words             
    
    #Create frequency histogram of 50 most common words and print summary of activity 
    fdist=FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
Example #27
def featureset(sample):
  comment, label = sample
  features = {}
#  tags = map(lambda statement: map(lambda (w,t):t, statement), comment)
  words = map(lambda statement: map(lambda (w,t):w, statement), comment)
  words = sum(words, [])
#  tags = sum(tags, [])
  size_= sum([len(word) for word in words])
  features['stmt_len'] = len(words)/float(len(comment))
  features['word_len'] = size_/float(len(words))
  features['size'] = size_
#  tags_dist = FreqDist(sum(tags, []))
#  for tag in TAGS:
#    features[tag] = tags_dist.get(tag, 0)
  dist = FreqDist([word.lower() for word in words])
#  num_stop_words = float(sum([dist.get(word, 0) for word in EN_STOPWORDS]))
#  features['prob_stop_words'] = num_stop_words/len(words)
  for word in EN_STOPWORDS:
    features[word] = dist.get(word, 0)/float(len(words))
  features['alwayson'] = 1.0
  for language in LANGUAGES:
    for i in range(1,n+1):
      word_sim, tag_sim, char_sim, w_s_sim = comment_similarity(GRAMS[language], comment, i)
      features['w_sim_%d_%s' % (i, language)] = word_sim
      features['t_sim_%d_%s' % (i, language)] = tag_sim
      features['c_sim_%d_%s' % (i, language)] = char_sim
#     features['s_sim_%d_%s' % (i, language)] = w_s_sim
  return (features, label)
def getTopNFreqWords(textArr,N):
    fdist = FreqDist(textArr)
    topWordsWithFreq = fdist.most_common(N)
    topWords=[]
    for word in topWordsWithFreq:
        topWords.append(word[0])
    return topWords
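A quick usage example for the helper above, with a made-up token list:

tokens = ["spam", "ham", "spam", "eggs", "spam", "ham"]
print(getTopNFreqWords(tokens, 2))   # ['spam', 'ham']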
Example #29
def posAnalysis(collection):

	reviews = collection.find(timeout=False)

	__reportProgress.counter = 0

	skip = 1

	for rev in reviews:
		if skip%200 == 0:
			print 'skip'+str(skip)
		__reportProgress()
		if rev.has_key('tags'):
			skip += 1
			if rev['tags'].has_key('NN'):				
				continue

		sents = sent_tokenize(rev['text'])
		tokens = [word for sent in sents for word in word_tokenize(sent)]
		pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
		tag_fd = FreqDist(tag for (word, tag) in pos)
		tags = dict()
		for (key,value) in tag_fd.items():
			k = key.replace('$','S')
			out = key.translate(string.maketrans("",""), string.punctuation)
			if len(out)>0:
				tags[k] = value
		collection.update({'_id':rev['_id']},{"$set": {"tags": tags}})		
def transmit_vocabulary(t_token, t_lang):
    languages = ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian',
                 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
    voc_stopwords = set()
    if t_lang in languages:
        voc_stopwords = set(stopwords.words(t_lang))
    i_f = codecs.open('csv/'+t_token+'.csv', 'r', 'utf-8')
    lines = i_f.readlines()
    all_tweets = []
    corpus_size = 0
    for line in lines:
        row = line.split('\t')
        words = word_tokenize(row[1])
        all_tweets.extend([w.lower() for w in words])
        corpus_size += 1
    freq_distribution = FreqDist(all_tweets)
    cats_vocabulary_elements = []
    for word, frequency in freq_distribution.most_common(1000):
        if word not in voc_stopwords:
            cats_vocabulary_elements.append('["' + word + '", ' + str(frequency) + ']')
    cats_vocabulary = '['+','.join(cats_vocabulary_elements)+']'
    print(cats_vocabulary)
    result_data = {'token': t_token, 'result': cats_vocabulary}
    json_data = json.dumps(result_data)
    results_request = urllib2.Request('http://mediamining.univ-lyon2.fr/cats/module/resultFile')
    results_request.add_header('Content-Type', 'application/json')
    results_request.data = json_data.encode('utf-8')
    urllib2.urlopen(results_request)
    print('Transmitted vocabulary for token '+t_token)
    os.remove('csv/' + t_token + '.csv')
Example #31
# Read input files
BNC_file = open("Preprocessed_BNC.txt", "r")
CS_file = open("Preprocessed_CS.txt", "r")

# tokenize two files
BNC_words = word_tokenize(BNC_file.read())
CS_words = word_tokenize(CS_file.read())

# filter words to remove punctuation
filter_words = [
    ' ', '?', '!', ',', ';', ':', '-', '--', '---', '(', ')', '{', '}', '[',
    ']', "'", '"', '.', '`', '·', '``', '~', "''"
]

# Filtered two datasets to remove punctuations
BNC_filtered_word = [w for w in BNC_words if w not in filter_words]
CS_filtered_word = [w for w in CS_words if w not in filter_words]

# computing frequency for two dataset
BNC_frequency_Dist = FreqDist(BNC_filtered_word)
CS_frequency_Dist = FreqDist(CS_filtered_word)

BNC = BNC_frequency_Dist.most_common()
CS = CS_frequency_Dist.most_common()

BNC_final_list = find_range_word(BNC)
CS_final_list = find_range_word(CS)

print_list_word(BNC_final_list, 'B_bnc_output.txt')
print_list_word(CS_final_list, 'B_cs_output.txt')
def good_turing_trigram_model(data):
    trigram_distribution = FreqDist(data)
    good_turing_trigram = SimpleGoodTuringProbDist(trigram_distribution)
    return good_turing_trigram
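A minimal sketch of how this function might be fed, assuming data is a list of trigram tuples built with nltk.ngrams (the sample text is made up, and with such a tiny sample NLTK may warn that the Good-Turing fit is unreliable):

from nltk import FreqDist
from nltk.util import ngrams
from nltk.probability import SimpleGoodTuringProbDist

text = ("the cat sat on the mat . the cat lay on the mat . "
        "the dog sat on the rug . the cat sat on the rug .").split()
trigram_data = list(ngrams(text, 3))

fd = FreqDist(trigram_data)
good_turing = SimpleGoodTuringProbDist(fd)
print(good_turing.prob(("the", "cat", "sat")))   # smoothed probability of a seen trigram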
Example #33
stopwords = set(stopwords.words('english'))
common_count = 100
max_n = 3

sentences = [cleaner(s) for s in gutenberg.sents('austen-emma.txt')]

join_item = lambda x: ' '.join(x)
common_items = {}
for n in range(1, max_n+1):
    all_text = []
    for sentence in [s for s in sentences if len(s)>1]:
        grams = ngrams(sentence, n)
        for gram in grams:
            all_text.append(gram)
    cur_common = [item[0] for item in FreqDist(all_text).most_common(common_count)]
    if n==1:
        common_items[n] = [join_item(item) for item in cur_common]
    else:
        common_items[n] = []
        for item in cur_common:
            tmp = {x : 0 for x in item}
            for x in range(1,n):
                for gram in ngrams(item, x):
                    if join_item(gram) in common_items[x]:
                        for word in gram:
                            tmp[word] = 1
            if not reduce(lambda x, y: x*y, tmp.values()):
                common_items[n].append(join_item(item))
    common_items[n] = sorted(common_items[n])
for n, val in enumerate(common_items.values()):
Example #34
    'ADJ_SAT': 's',
    'ADV': 'r',
    'NOUN': 'n',
    'VERB': 'v'
}
# We'll use the reuters corpus in NLTK.
# The same steps of preprocessing can be done on documents read in from external files.

# How many files are there in the corpus?
# What are their categories? Single or multiple categories for one file?
len(reuters.fileids())  #
cats = [
    reuters.categories(f) for f in reuters.fileids()
]  # for every file in the Reuters corpus, collect the categories of each article into cats
cat_num = [len(c) for c in cats]
fd_num = FreqDist(cat_num)
fd_num.plot()

# How many documents are there in each category?
# FreqDist() can be used to find the answer, but we need to flatten the list of categories first.
cats_flat = [
    c for l in cats for c in l
]  # cats is a list of lists, so flatten each inner list
fd_cat = FreqDist(cats_flat)
fd_cat
fd_cat.most_common(20)

# Let's pick two categories and visualize the articles in each category using word cloud
grain = reuters.fileids('grain')
trade = reuters.fileids('trade')
#tokenised document
preprocessedStory = preprocess(storytext)
tokens = nltk.word_tokenize(preprocessedStory)
print(tokens[0:20])


def lexical_diversity(text):
    return len(set(text)) / len(text)


lexical_diversity(tokens)

len(tokens)

len(set(tokens))

fdist1 = FreqDist(tokens)
print(fdist1)

fdist1.plot(50, cumulative=True)

from nltk.corpus import stopwords
stop = stopwords.words('english')
remstop = [i for i in tokens if i not in stop]
remstop[0:20]

len(remstop)

len(set(remstop))

lexical_diversity(remstop)
Example #36

def takeSecond(elem):
    return elem[1]


if __name__ == '__main__':
    # list of sub-lists of tokens, each padded with <s> and </s> for sentence start and end
    preprocessed = [
        pad_both_ends(s.strip(' ').split(' '), n=2) for s in sentences('book')
    ]

    # flatten() unpacks the elements of the sub-lists of preprocessed into a single list
    tokens = list(flatten(preprocessed))

    fd = FreqDist(tokens)  # unigram counts (dict mapping each word to its frequency)

    model = bigrams(tokens)  # bigram model
    cfd = ConditionalFreqDist(
        model
    )  # bigram counts (dict of sub-dicts: keys = following words, values = their frequencies)

    corpus = {}  # dict of corpus

    traincorpus(
    )  # write the bigram model trained on the corpus file (book) out as a JSON file
    # [print(s, o) for s, o in fd.items()]

    nex = '<s>'
    sentence = []
    while nex != '</s>':
Example #37
from sklearn.metrics import r2_score, make_scorer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import brown, wordnet
from scipy.stats import pearsonr, spearmanr
from matplotlib import pyplot as plt

wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words("english"))

tagger = nltk.tag.pos_tag
frequency_list = FreqDist(i.lower() for i in brown.words())
all_words_count = 0
for i in frequency_list:
    all_words_count += frequency_list[i]


def get_words(sentence):
    return [i.strip('., ') for i in sentence.split(' ')]


with open('word_to_vec', 'r') as f:
    embeddings = {}
    for line in f.readlines():
        args = get_words(line.strip("\n\t "))
        embeddings[args[0]] = [float(i) for i in args[1:]]
Example #38
    # run the sample tweets through the remove-noise method, convert them into words, and put them
    # into the positive or negative list
    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for tokens in positive_tweet_tokens:
        positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    for tokens in negative_tweet_tokens:
        negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    # run through the list of words from the sample tweets and display the most common ones;
    # this gives us a visual sense of the features and helps us interpret the accuracy of the algorithms
    freq_dist_pos = FreqDist(all_pos_words)
    print("The most common 10 words:")
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # shuffle the data set before splitting into train and test sets, so the algorithm runs on a random split
Example #39
	sentence_list_nights.append((file, len(corpus_nightsII.sents(file))))
	sentence_dic_nights[file] = len(corpus_nightsII.sents(file))
	sentence_dic_nights = collections.OrderedDict(sentence_dic_nights) # we make sure that the order of the data stays the same 

# Which night has the most sentences?
for file, characters in sentence_list_nights:
	if characters == max(sentence_dic_nights.values()):
		print(file, characters)	# the Eight Hundred and Forty-fifth.txt => 399

# In the following block of code, we find the most frequent word length in each night

dict_word_length = {}
for file in corpus_nightsII.fileids():
	text = corpus_nightsII.words(file)
	x = [len(words) for words in text]
	fdist = FreqDist(x)
	dict_word_length[file] = fdist.max()
print(dict_word_length)

# We now calculate the readability for each file. We do this by using the Automated Readability Index (ARI).

stat_list = []
x = word_dic_nights.keys()
for name in x:
	n_char = char_dict_night[name]
	n_words = word_dic_nights[name]
	n_sents = sentence_dic_nights[name]
	stat_list.append((name, n_char, n_words, n_sents))
print(stat_list)

def ARI(n_char, n_words, n_sents):
Example #40
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output


tokenized_reviews = pd.Series(review).apply(lambda x: x.split())
reviews_2 = lemmatization(tokenized_reviews)
reviews_3 = []
for i in range(len(reviews_2)):
    reviews_3.append(' '.join(reviews_2[i]))
review = reviews_3
# collect all words
list_data = []
for i in review:
    list_data += i.lower().split()
# word frequency counts
fdist = FreqDist(list_data)
words_df = pd.DataFrame({
    'word': list(fdist.keys()),
    'count': list(fdist.values())
})
# visualize the word frequencies
d = words_df.nlargest(columns="count", n=20)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=d, x="word", y="count")
ax.set(ylabel='Count')
plt.show()
# LDA topic modeling
import pyLDAvis
import pyLDAvis.gensim
import gensim
from gensim import corpora
Example #41
            allwords.append(j.lower())
# print allwords
for i in range(len(allwords)):
    allwords = [re.sub(r'[^\w\s]', '', s) for s in allwords]
allwords = set(allwords)
allwords = list(allwords)
y = np.zeros(len(allwords))
# print allwords
for i in range(len(allwords)):
    try:
        y[i] = int((complexity[allwords[i]]))

    except:
        y[i] = 0
# print y
fdist = FreqDist(brown.words())

x = []
for i in range(len(allwords)):
    x.append([])
for i in range(len(allwords)):
    x[i].append(fdist.freq(allwords[i]))
    x[i].append(len(allwords[i]))
    x[i].append(synobj.synCount(allwords[i]))
    x[i].append(ww.wdweight(allwords[i]))
    x[i].append(vc.vCount(allwords[i]))
    x[i].append(synobj.len_of_synonyms(allwords[i]))

classifier = DecisionTreeClassifier()
classify = classifier.fit((x[0:int(len(x) * 0.8)]), y[0:int(len(y) * .8)])
ypred = classifier.predict(XTest)
Example #42
    for index, item in market_basket_0.loc[i].items():
        if item != 0:
            temp_set.add(item)
    transactions.append(temp_set)
# print(transactions)

# collect all items, stacked into one list in index order, with the transaction indices in a parallel list
indexid = []
item = []
for m in range(len(transactions)):
    for x in transactions[m]:
        indexid.append(m)
        item.append(x)

# count item frequencies
fre = FreqDist(item)
print(fre.most_common(10))

# build lists of items and counts
list_fre = list(fre.most_common(10))
list_item = []
list_count = []
for x in list_fre:
    list_item.append(x[0])
    list_count.append(x[1])

# frequency distribution plots
fre.tabulate(10)
fre.plot(10)

# pie chart...
Example #43
from nltk import FreqDist
from common.books import text1

fdist = FreqDist(len(w) for w in text1())
print(fdist)
# print(fdist.keys())
# print(fdist.items())

print(fdist.most_common())
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))
Example #44
# In[15]:

tokens = nltk.word_tokenize(raw)
type(tokens)

# In[16]:

words1 = [w.lower() for w in tokens]  #list comprehension

#only keep text words, no numbers
words2 = [w for w in words1 if w.isalpha()]

# In[17]:

freq = FreqDist(words2)
sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)
sorted_freq

# In[31]:

freq.plot(30)

# In[32]:

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# In[33]:

words_nostopwords = [w for w in words2 if w not in stopwords]
Example #45
import nltk
from nltk import FreqDist, NaiveBayesClassifier
from nltk.corpus import movie_reviews
import random
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
all_words = FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
        if len(word) > 2:
            synsets = wn.synsets(word)
            if synsets:
                for synset in synsets:
                    for lemma in synset.lemma_names():
                        if "_" not in lemma:
                            features['synset({})'.format(lemma)] = (
                                lemma in document_words)
    return features
Example #46
def create_training_examples(statement_list: List[List[str]],
                             trees,
                             training_spans,
                             training: bool,
                             max_span_length,
                             num_sentence_words,
                             args,
                             pos_tags,
                             constituents,
                             k=1):
    """
    This function defines a training set for span prediction.
    I computes features (inputs) for eveyspans.
    Multiple spans are extracted for a given statement, and features are computed
    for each.
    :input statement_list: list of token lists.
    :input trees: parse tree list, aligned with statement_list
    :training: bool, indicates whether annotation (True/False) is computed.
    :k: int, number of negative span examples per positive. Default: 1 (1:1 ratio)
    """
    print('building features...')

    word_frequencies = FreqDist([w.lower() for w in reuters.words()])
    # list of science tokens & science multiword expressions
    science_tokens = get_science_terms(args.data_path)
    science_expressions = get_science_terms(args.data_path, False)
    stop_words = set(stopwords.words('english'))

    true_examples, false_examples, examples_per_sentence = [], [], []
    span_indexes = []

    # loop over all statements
    for i, statement in enumerate(statement_list):
        print(i)

        span_index = []  #enumerating all spans for this sentence.
        pos_tags_this_statement = pos_tag(statement)
        sentence_word_frequencies = [word_frequencies.freq(token.lower()) \
                        for token in statement]
        tree = trees[i]
        false_examples_this_instance = []
        sentence_candidate_span_examples = []  # when training.

        # loop across different spans for given sentence
        for span in legal_spans(num_sentence_words,
                                max_span_length):  #globally legal
            if span[1] > len(statement):
                continue
            if span[0] > len(statement):
                break
            span_index.append(span)

            # extract scalar features for this span. [position will not be used]
            f_bias = 1
            f_length = span[1] - span[0]
            #f_begin = span[0]
            #f_end = span[1]
            #f_dist_to_end0 = len(statement) - span[0]
            #f_dist_to_end1 = len(statement) - span[1]

            # list of tokens of this span
            span_tokens = statement[span[0]:span[1]]

            # feature: span contains at least one science token
            f_science_token = bool( \
                set(span_tokens).intersection(science_expressions))

            f_science_token_count = 0  # counting # of science tokens in span
            max_token_length = 0  # in this span.
            for token in span_tokens:
                f_science_token_count += int(token in science_tokens)
                max_token_length = max(max_token_length, len(token))

            f_max_token_length = np.log(max_token_length)

            # feature: relative word frequency average
            # with numerical stability/ avoiding -inf
            f_avg_word_frequency = 1e-10 + np.mean(
                sentence_word_frequencies[span[0]:span[1]])
            f_avg_word_frequency = np.log(f_avg_word_frequency)

            # feature: begin with stop word?
            f_stop_word_begin = bool(span_tokens[0] in stop_words)

            # POS indicator (one-hot)
            f_pos = np.zeros([len(pos_tags)])

            # Bag-of-POS-tags for this span.
            for token, tag in pos_tags_this_statement[span[0]:span[1]]:
                f_pos[pos_tags.index(tag)] += 1.0

            # feature: POS indicator for span beginning
            f_pos_beginning = np.zeros([len(pos_tags)])
            f_pos_beginning[pos_tags.index(
                pos_tags_this_statement[span[0]][1])] = 1.0

            # feature: POS indicator for span end
            f_pos_end = np.zeros([len(pos_tags)])
            f_pos_end[pos_tags.index(pos_tags_this_statement[span[1] -
                                                             1][1])] = 1.0

            # feature: POS bigram indicator
            # define extended POS tag set with additional begin and end symbols for bigrams.
            # pos_tags_bigram = pos_tags + ["POS_BEGIN", "POS_END"]

            # for POS bigrams.
            # pos_tags_square = [x for x in product(pos_tags_bigram, pos_tags_bigram)]

            # f_pos_bigram = np.zeros([len(pos_tags_square)])

            # obtaining the POS bigram
            # for position in range(-1, f_length):
            # boundary cases: start of span and end of span.
            #    if position == -1:
            #        tag1 = 'POS_BEGIN'
            #        _, tag2 = pos_tags_this_statement[span[0]]
            #    elif position == f_length -1:
            #        _, tag1 = pos_tags_this_statement[span[0]+position]
            #        tag2 = 'POS_END'
            #    #normal case: inside span.
            #    else:
            #        _, tag1 = pos_tags_this_statement[span[0] + position]
            #        _, tag2 = pos_tags_this_statement[span[0] + position + 1]
            #
            #    f_pos_bigram[pos_tags_square.index( ( tag1, tag2 ) )] += 1.0

            # constituent tree features

            tree_position = tree.treeposition_spanning_leaves(span[0], span[1])

            # smallest subtree in constituent parse, containing this span.
            smallest_subtree = tree[tree_position[:-1]]
            constituent_tag = smallest_subtree.label()

            # feature: is this span a constituent parse subtree span?
            f_span_match = bool(span[1] - span[0] == len(smallest_subtree))

            # constituency parse label indicator
            f_span_constituent = np.zeros([len(constituents)])
            f_span_constituent[constituents.index(constituent_tag)] = 1.0

            # constituency parse label indicator with indication for large spans.
            f_span_constituent_big = np.zeros([len(constituents)])
            f_span_constituent_big[constituents.index(constituent_tag)] = (
                f_length > 2)

            # leave out position features:
            ####  f_begin, f_end, f_dist_to_end0, f_dist_to_end1,

            #now collect all features:
            f_scalars = np.array([
                f_bias, f_span_match, f_length, f_science_token,
                f_avg_word_frequency, f_stop_word_begin, f_max_token_length,
                f_science_token_count
            ])

            # these are all features for this span, in a np array.
            feature_vector = np.concatenate(
                (f_scalars, f_pos, f_pos_beginning, f_pos_end,
                 f_span_constituent, f_span_constituent_big))

            # provide True/False annotation in case the data is used for training.
            if training:
                if span == training_spans[i]:
                    #positive example
                    true_examples.append(feature_vector)
                    sentence_candidate_span_examples.append(
                        (feature_vector, True))
                else:
                    #negative example
                    false_examples_this_instance.append(feature_vector)
                    sentence_candidate_span_examples.append(
                        (feature_vector, False))
            else:
                sentence_candidate_span_examples.append(feature_vector)

        span_indexes.append(span_index)
        examples_per_sentence.append(sentence_candidate_span_examples)

        # select at random k negative spans as training examples. default 1:1
        if training:
            for random_index in np.random.randint(
                    0, len(false_examples_this_instance), k):
                false_examples.append(
                    false_examples_this_instance[random_index])

    print(len(true_examples), 'True span examples.')
    print(len(false_examples), 'False span examples.')

    # collect true and false examples [inputs]
    all_examples = np.concatenate(
        (np.asarray(false_examples), np.asarray(true_examples)))

    # collect annotations for each example (True/False target outputs)
    false_span_labels = np.zeros([len(false_examples)])
    true_span_labels = np.ones([len(true_examples)])
    all_labels = np.concatenate((false_span_labels, true_span_labels))

    return all_examples, all_labels, examples_per_sentence, span_indexes
Example #47
print(vocab["barber"])
vocab_size = 5
vocab = vocab.most_common(vocab_size)
vocab
word_to_index = {}
i = 0
for (word, frequency) in vocab:
    i = i + 1
    word_to_index[word] = i
print(word_to_index)

#%% Using NLTK's FreqDist
from nltk import FreqDist
import numpy as np

vocab = FreqDist(np.hstack(sentences))
print(vocab["barber"])

vocab_size = 5
vocab = vocab.most_common(vocab_size)
vocab
word_to_index = {word[0]: index + 1 for index, word in enumerate(vocab)}
print(word_to_index)

#%% Text preprocessing with Keras
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
print(tokenizer.word_index)
print(tokenizer.word_counts)
Example #48
    nlz_words3 = [
        word for word in nlz_words2
        if not (len(word) == 1 and not is_num(word))
    ]

    return nlz_words3


text = data_collector.load_data('apple', 'keyword')
label = stock_data.stock_price_label('AAPL', 14, 5)
inputs = data_collector.merge_price_text(text, label).values
nlz_inputs = [([word for word in normalizing(words)], tuple(label))
              for (words, label) in inputs]
inputs = [(words, tuple(label)) for (words, label) in inputs]
all_words = list(itertools.chain(*[words for (words, _) in nlz_inputs]))
fd = FreqDist(all_words)
word_features = [word for (word, _) in fd.most_common(2000)]


def features_contain(words):
    '''
    A feature extractor whose features indicate whether or not individual
    words are present in the given list of words.

    return example:
    {'contain(apple)': True, 'contain(banana)': False, ...}

    :param words: A list of words
    :type words: List
    :return: Features that indicate whether or not individual words are present
    in the given list of words
Example #49
from nltk import FreqDist

text = 'Hello ! This is a course designed for people who are interested in learning the core concepts of NLP and ' \
        'utilising those concepts to make applications to perform sentiment analysis analysis'

# Freq Dist - input list

text_list = text.split(' ')
print(text_list)

freqDist = FreqDist(text_list)
words = list(freqDist.keys())
print(words)

print(freqDist['analysis'])
Example #50
def frequent(context):
    freq = FreqDist(context)
    return freq
Example #51
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(positive_cleaned_tokens)

# Get the frequency of words.
freq_dist_pos = FreqDist(all_pos_words)


def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)


# Convert list of words to dictionary with words as keys and True as values
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens)

# Attach a positive or negative label to each tweet.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
Example #52
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')


# In[29]:


# Remove noise (normalize + stop word removal)
positive_cleaned_tokens = remove_noise(positive_tweet_tokens)
negative_cleaned_tokens = remove_noise(negative_tweet_tokens)


# In[30]:


# Word frequency distributions
positive_freq_dist = FreqDist(get_all_words(positive_cleaned_tokens))
negative_freq_dist = FreqDist(get_all_words(negative_cleaned_tokens))
print(positive_freq_dist.most_common(10))
print(negative_freq_dist.most_common(10))


# In[31]:


# Convert data to NLTK-required format
positive_dataset = get_dataset_from_tokens(positive_cleaned_tokens, "Positive")
negative_dataset = get_dataset_from_tokens(negative_cleaned_tokens, "Negative")
dataset = positive_dataset + negative_dataset


# In[32]:
Example #53
def cptj(x):  # word frequency count
    from nltk import FreqDist
    ciping=FreqDist(x)
    return ciping
#!/usr/bin/python3
# coding: utf-8
import nltk
from nltk.corpus import gutenberg  # import the Gutenberg corpus
##################################################################
## FreqDist tracks the sample frequencies in a distribution
from nltk import FreqDist  # import the FreqDist class
fd = FreqDist(
    gutenberg.words('austen-persuasion.txt'))  # instantiate a frequency distribution, counting the tokens of the text
print(
    fd
)  # <FreqDist with 51156 samples and 2621613 outcomes>; 51156 distinct samples, 2621613 tokens
print(type(fd))  # <class 'nltk.probability.FreqDist'>
print(fd['the'])  # 3120; look up a word's count; a FreqDist behaves like a dict
print(fd.N())  # 98171; counts word tokens (with repetitions), not characters
print(fd.B()
      )  # 6132; number of bins or unique samples; identical words share a single bin
print(len(fd.keys()), type(fd.keys()))  # 6132 <class 'dict_keys'>
print(fd.keys())  # fd.B() only gives the count; this prints the whole vocabulary
print(fd.max())  # the single most frequent word
print(fd.freq('the'))  # 0.03178127960395636; relative frequency, 3120 / 98171
print(fd.hapaxes())  # ['[', 'Persuasion', 'Jane', ...] hapaxes: words that occur only once
# The most frequent words are mostly function words, while the rarest (hapaxes) can only be understood from context;
# the words at either frequency extreme usually say little about what makes the text distinctive
for idx, word in enumerate(fd):  # you can iterate with enumerate; entries come back in order of appearance
    if idx == 5: break
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## 统计词的长度频率
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.most_common())  # (word length, count) pairs, most common length first
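# A follow-up sketch illustrating the comment above: since neither very
# frequent nor very rare words characterise a text well, a common compromise
# is to look at reasonably long words that still occur fairly often.
words_p = gutenberg.words('austen-persuasion.txt')
fd_p = FreqDist(w.lower() for w in words_p)
informative = sorted(w for w in set(fd_p) if len(w) > 7 and fd_p[w] > 7)
print(informative[:20])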
Beispiel #55
0
from nltk.corpus import brown

print(brown.categories())

print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))

from nltk import FreqDist
news = brown.words(categories='news')
fdist = FreqDist([w.lower() for w in news])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print('{0}: {1}'.format(m, fdist[m]))
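# A natural extension, sketched here: a ConditionalFreqDist makes the same
# modal-verb comparison across several Brown genres in a single table.
from nltk import ConditionalFreqDist

cfd = ConditionalFreqDist(
    (genre, word.lower())
    for genre in ['news', 'religion', 'hobbies', 'romance']
    for word in brown.words(categories=genre))
cfd.tabulate(samples=modals)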
Beispiel #56
0
def lesk_text(t, s):
    # LESK applied between two texts, so that it works generically
    t_def = FreqDist(prep_text(t).split())
    s_def = FreqDist(prep_text(s).split())
    intersection = (t_def) & (s_def)
    return len(intersection)
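# A quick note on how lesk_text() counts overlap: FreqDist inherits from
# collections.Counter, so `&` keeps the words common to both distributions
# (with their minimum counts) and len() then counts the distinct shared words.
from nltk import FreqDist

fd_a = FreqDist("the bank of the river".split())
fd_b = FreqDist("a river bank after the flood".split())
print(fd_a & fd_b)       # the three shared words, each with its minimum count
print(len(fd_a & fd_b))  # 3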
texts[:300]

# Remove stop words

with open('./stopwords.txt', 'r', encoding='UTF-8') as f:
    stopwords = f.read()

stopwords = stopwords.split(' ')
stopwords[:10]

texts = [text for text in texts if text not in stopwords]
# keep only the words from the original that do not appear in the stop-word file

import pandas as pd
from nltk import FreqDist
freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
freqtxt[:25]

# use pandas to extract the most frequent words

from konlpy.tag import Okt
# stem: the word stem, i.e. the part of the word that carries the meaning
# tag: the part of speech, e.g. noun, verb, ...
okt = Okt()
okt.pos('가치창출')
okt.pos('갤럭시')

# Render the word cloud

from wordcloud import WordCloud
wcloud = WordCloud(ctx + "D2Coding.ttf",
                   relative_scaling=0.2,
Beispiel #58
0
def load_data(source, dist, vocab_size=10000, limit=None):

    # Reading raw text from source and destination files
    f = open(source, 'r')
    X_data = f.read()
    f.close()
    f = open(dist, 'r')
    y_data = f.read()
    f.close()

    print('raw data read')

    if limit is not None:
        X_data = X_data[:limit]
        y_data = y_data[:limit]

    # Splitting raw text into array of sequences
    X = [
        text_to_word_sequence(x)
        for x, y in zip(X_data.split('\n'), y_data.split('\n'))
        if len(x) > 0 and len(y) > 0
    ]
    y = [
        text_to_word_sequence(y)
        for x, y in zip(X_data.split('\n'), y_data.split('\n'))
        if len(x) > 0 and len(y) > 0
    ]

    # Creating the vocabulary set with the most common words (leaving room for PAD, START, UNK)
    dist = FreqDist(np.hstack(X))
    X_vocab = dist.most_common(vocab_size - len(EXTRA_SYMBOLS))
    dist = FreqDist(np.hstack(y))
    y_vocab = dist.most_common(vocab_size - len(EXTRA_SYMBOLS))

    # Create an array of words from the vocabulary set; we will use it as the index-to-word mapping
    X_ix_to_word = [word[0] for word in X_vocab]
    # Prepend the extra symbols (e.g. PAD, START, UNK) to the beginning of the array
    X_ix_to_word = EXTRA_SYMBOLS + X_ix_to_word

    # Creating the word-to-index dictionary from the array created above
    X_word_to_ix = {word: ix for ix, word in enumerate(X_ix_to_word)}

    # print(X_word_to_ix['<PAD>'])
    # print(X_word_to_ix['the'])
    # print(X_word_to_ix['session'])
    # print(X_word_to_ix['resumption'])

    # Converting each word to its index value
    for i, sentence in enumerate(X):
        for j, word in enumerate(sentence):
            if word in X_word_to_ix:
                X[i][j] = X_word_to_ix[word]
            else:
                X[i][j] = X_word_to_ix['<UNK>']

    # for s in range(3):
    #     print('___ ', ' '.join(X_ix_to_word[id] for id in X[s]))

    y_ix_to_word = [word[0] for word in y_vocab]
    y_ix_to_word = EXTRA_SYMBOLS + y_ix_to_word

    y_word_to_ix = {word: ix for ix, word in enumerate(y_ix_to_word)}

    for i, sentence in enumerate(y):
        for j, word in enumerate(sentence):
            if word in y_word_to_ix:
                y[i][j] = y_word_to_ix[word]
            else:
                y[i][j] = y_word_to_ix['<UNK>']

    return X, len(X_vocab)+2, X_word_to_ix, X_ix_to_word, \
           y, len(y_vocab)+2, y_word_to_ix, y_ix_to_word
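# A usage sketch for load_data(); 'data/source.txt' and 'data/target.txt' are
# hypothetical file paths, and EXTRA_SYMBOLS is assumed to be defined elsewhere
# in the project (typically something like ['<PAD>', '<START>', '<UNK>']).
X, X_vocab_len, X_word_to_ix, X_ix_to_word, \
    y, y_vocab_len, y_word_to_ix, y_ix_to_word = load_data(
        'data/source.txt', 'data/target.txt', vocab_size=10000)
print(len(X), 'sequence pairs;', X_vocab_len, 'source vocabulary entries')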
def suggest():

    # get the requested language
    lang = request.args.get('lang', 'en')
    import nltk
    nltk.download('punkt')

    if lang == 'en':
        word_column_names = ['Count', 'Word']

    if lang == 'fr':
        word_column_names = ['Nombre', 'Mot']

    # get the URL to analyze
    url = request.args.get('url', 'https://www.canada.ca/en.html')

    #get the html from the URL
    import requests
    r = requests.get(url)
    html = r.text

    #get the html content as text - get content from the "main" tag
    from bs4 import BeautifulSoup
    original_soup = BeautifulSoup(html, features="lxml").find('main')
    original_text = original_soup.get_text()
    original_text = original_text.replace('..', '.')
    original_text = original_text.replace('.', '. ')
    original_text = original_text[:original_text.find("defPreFooter")]
    original_text = original_text.replace('\n', '')
    original_text = original_text.replace('\t', '')
    original_text = original_text.replace('\r', '')

    #get initial readability total_score
    from readability import Readability
    r_o = Readability(original_text)
    original_fk = r_o.flesch_kincaid()
    original_score = original_fk.score
    original_score = format(original_score, '.2f')

    # add periods after bullet points and headings so that the Flesch-Kincaid score counts them as sentences
    html1 = html.replace("</li>", ".</li>")
    html2 = html1.replace("</h1>", ".</h1>")
    html3 = html2.replace("</h2>", ".</h2>")
    html4 = html3.replace("</h3>", ".</h3>")
    html5 = html4.replace("</h4>", ".</h4>")
    html6 = html5.replace("</h5>", ".</h5>")
    html7 = html6.replace("</h6>", ".</h6>")

    #get adjusted readability total_score
    revised_soup = BeautifulSoup(html7, features="lxml").find('main')
    for t in revised_soup.select('table'):
        t.extract()
    revised_text = revised_soup.get_text()
    revised_text = revised_text.replace('..', '.')
    revised_text = revised_text.replace('.', '. ')
    revised_text = revised_text[:revised_text.find("defPreFooter")]
    revised_text = revised_text.replace('\n', '')
    revised_text = revised_text.replace('\t', '')
    revised_text = revised_text.replace('\r', '')

    from readability import Readability
    r_f = Readability(revised_text)
    final_fk = r_f.flesch_kincaid()


    #tokenize the text for processing

    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(revised_text)
    words = []
    for word in tokens:
        words.append(word.lower())

    #remove stop words from the tokens to get only the meaningful words
    nltk.download('stopwords')
    sw_en = nltk.corpus.stopwords.words('english')
    words_ns_en = []
    for word in words:
        if word not in sw_en:
            words_ns_en.append(word)

    # get the 20 most frequent words in the text
    from nltk import FreqDist
    fdist1_en = FreqDist(words_ns_en)
    most_common_en = fdist1_en.most_common(20)
    mc_en = pd.DataFrame(most_common_en, columns =['Word', 'Count'])
    mc_en = mc_en[['Count', 'Word']]


    sw_fr = nltk.corpus.stopwords.words('french')
    words_ns_fr = []
    for word in words:
        if word not in sw_fr:
            words_ns_fr.append(word)

    # get the 20 most frequent words in the text
    from nltk import FreqDist
    fdist1_fr = FreqDist(words_ns_fr)
    most_common_fr = fdist1_fr.most_common(20)
    mc_fr = pd.DataFrame(most_common_fr, columns =['Mot', 'Nombre'])



    #get all headings and calculate how many words on average between headings
    headings = original_soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    len_headings = len(headings)
    hratio = len(words)/(len(headings))

    #get all paragraphs and all bulleted list, and calculate how many words per paragraph on average
    paragraphs = original_soup.findAll(['p', 'ul'])
    len_par = len(paragraphs)
    pratio = (len(words)/len(paragraphs))

    #calculate points for readability
    if final_fk.score <= 6:
        fkpoints = 60
    elif final_fk.score >= 18:
        fkpoints = 0
    else :
        fkpoints = (60-((final_fk.score-6)*5))

    #calculate points for number of words between headings
    if hratio <= 40:
        hpoints = 20
    elif hratio >= 200:
        hpoints = 0
    else :
        hpoints = (20-((hratio-40)*0.125))

    #calculate points for number of words per paragraph
    if pratio <= 30:
        ppoints = 20
    elif pratio >= 80:
        ppoints = 0
    else :
        ppoints = (20-((pratio-30)*0.4))

    #add all points
    total_score = fkpoints+hpoints+ppoints
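    # Worked example of the scoring above (assumed inputs, not taken from a real
    # page): with final_fk.score = 10, hratio = 80 and pratio = 50 we would get
    # fkpoints = 60 - (10 - 6) * 5 = 40, hpoints = 20 - (80 - 40) * 0.125 = 15 and
    # ppoints = 20 - (50 - 30) * 0.4 = 12, so total_score = 67.00 ("Needs work").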
    total_score = format(total_score, '.2f')
    fkpoints = format(fkpoints, '.2f')
    final_fk_score = format(final_fk.score, '.2f')
    hpoints = format(hpoints, '.2f')
    hratio = format(hratio, '.2f')
    ppoints = format(ppoints, '.2f')
    pratio = format(pratio, '.2f')
    total_words = len(words)

    total_score = float(total_score)
    if total_score >= 90:
        if lang=='en':
            score = 'Outstanding!'
        if lang=='fr':
            score = 'Excellent!'
    elif total_score >= 80 and total_score < 90:
        if lang=='en':
            score = 'Very good!'
        if lang=='fr':
            score = 'Très bien!'
    elif total_score >= 70 and total_score < 80:
        if lang=='en':
            score = 'Not too bad'
        if lang=='fr':
            score = 'Pas mal'
    elif total_score >= 60 and total_score < 70:
        if lang=='en':
            score = 'Needs work'
        if lang=='fr':
            score = 'À travailler'
    elif total_score >= 50 and total_score < 60:
        if lang=='en':
            score = 'Needs a lot of work'
        if lang=='fr':
            score = 'Besoin de beaucoup de travail'
    elif total_score < 50:
        if lang=='en':
            score = "Please don't do this to people..."
        if lang=='fr':
            score = "S'il vous plaît, il faut faire quelque chose..."

    if lang == "en":
        return render_template("read_score_en.html", total_score = total_score, fkpoints = fkpoints, final_fk_score = final_fk_score, hpoints = hpoints, hratio = hratio, ppoints = ppoints, pratio = pratio, total_words = total_words, url = url, lang = lang, word_column_names = word_column_names, row_data_word_en = list(mc_en.values.tolist()), row_data_word_fr = list(mc_fr.values.tolist()), zip = zip, score = score, len_headings = len_headings, len_par = len_par, original_score = original_score)

    if lang == "fr":
        return render_template("read_score_fr.html", total_score = total_score, fkpoints = fkpoints, final_fk_score = final_fk_score, hpoints = hpoints, hratio = hratio, ppoints = ppoints, pratio = pratio, total_words = total_words, url = url, lang = lang, word_column_names = word_column_names, row_data_word_en = list(mc_en.values.tolist()), row_data_word_fr = list(mc_fr.values.tolist()), zip = zip, score = score, len_headings = len_headings, len_par = len_par, original_score = original_score)
def wordcloud_generator(text):
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('movie_reviews')
    nltk.download('punkt')

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    print("tokens created...")

    stop_words = stopwords.words('english')
    filtered_token = []
    for w in tokens:
        if w not in stop_words and len(w) > 3:
            filtered_token.append(w)
    print("stop words removed...")

    lemmatizer = WordNetLemmatizer()
    lemmatized_filtered_token = []
    for w in filtered_token:
        if len(w) > 3:
            lemmatized_filtered_token.append(lemmatizer.lemmatize(w))

    pos_tagged_token = nltk.pos_tag(lemmatized_filtered_token)

    adjective_tokens_0 = []
    for w in pos_tagged_token:
        if w[1] == 'JJ' and len(w[0]) > 3:
            adjective_tokens_0.append(w[0])
    print("Level 1 Adjective sorting done...")

    x = nltk.pos_tag(adjective_tokens_0)

    adjective_tokens_1 = []
    for w in x:
        if w[1] == 'JJ' and len(w[0]) > 3:
            adjective_tokens_1.append(w[0])
    print("Level 2 Adjective sorting done...")

    y = nltk.pos_tag(adjective_tokens_1)

    adjective_tokens_2 = []
    for w in y:
        if w[1] == 'JJ' and len(w[0]) > 3:
            adjective_tokens_2.append(w[0])
    print("Level 3 Adjective sorting done...")

    freq_dist = FreqDist(adjective_tokens_2)
    common_words = freq_dist.most_common(50)
    max_freq_list = []
    for w in common_words:
        max_freq_list.append(w[0])
    print(
        "50 most common words selected for colour sorting... Polarity Finding function called..."
    )

    word_polarity(max_freq_list)

    color_to_words = {'#00ff00': pos_word_list, 'red': neg_word_list}
    default_color = 'grey'
    print("Colours associated with given words...")

    grouped_color_func = GroupedColorFunc(color_to_words, default_color)
    print("Calling Wordcloud Creator...")
    myimage = calc_freq(adjective_tokens_2, grouped_color_func)
    print("DISPLAYING THE WORDCLOUD !!")
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(myimage)
    plt.axis('off')
    plt.show()
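# A minimal usage sketch for wordcloud_generator(); 'reviews.txt' is a
# hypothetical input file, and the helpers the function relies on
# (word_polarity, GroupedColorFunc, calc_freq, pos_word_list, neg_word_list)
# are assumed to be defined elsewhere in the module.
with open('reviews.txt', encoding='utf-8') as f:
    wordcloud_generator(f.read())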