Example No. 1
def condensify(train):
    """
    Takes either a single fileid string or a list of fileid strings.
    Returns a list of summaries; for a single string the returned list
    contains one summary.
    """
    summ_list = []
    if isinstance(train, str):
        train = [train]
    for t in train:
        summ = []
        k = 0
        # corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary = corpora.Dictionary([w for w in reuters.sents(t)])
        corpus = [dictionary.doc2bow(w) for w in reuters.sents(t)]
        matrix = matutils.corpus2csc(corpus)
        # print(matrix)
        u, sigma, vt = sparse.linalg.svds(matrix)
        (k, l) = vt.shape
        while k >= 1:
            if reuters.sents(t)[vt[k - 1].argmax()] not in summ:
                summ.append(reuters.sents(t)[vt[k - 1].argmax()])
            k -= 1
        v = []
        for s in summ:
            v.append(" ".join(s))
        summ = "".join(v)
        summ_list.append(summ)
    return (summ_list)
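
The excerpt above relies on gensim and scipy objects that it never imports. A minimal, hedged sketch of the assumed setup and a call (the fileid is arbitrary; the truncated SVD needs a document with more than six sentences):

# Assumed imports for the excerpt above; not part of the original example.
from gensim import corpora, matutils
from scipy import sparse
import scipy.sparse.linalg  # makes sparse.linalg.svds available
from nltk.corpus import reuters

summaries = condensify(reuters.fileids()[0])  # a single fileid yields a one-element list
print(summaries[0])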
Example No. 2
def import_reuters_flat_pos(ds, silent=False, log=sys.stdout):
    """
    Import the reuters corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_flat_pos(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    
    tagger = nltk.data.load("./models/treebank_brill_aubt/treebank_brill_aubt.pickle")
    
    if not silent:
        total = len(reuters.sents())
        counter = 0
    root_handle = ds.insert("#reuters")
    for sent in reuters.sents():
        sent = tagger.tag(sent)
        norm = [nltk.tuple2str(t) for t in sent]
        sen_handle = ds.insert(norm)
        ds.link(root_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 100 == 0):
                print("importing %s of %s sentences..." % (counter, total), 
                    file=log)
Example No. 4
def getSummaries(cfs, number_of_summaries):
    for n in range(number_of_summaries):
        # get a random article from the corpus
        article = random.choice(reuters.fileids())
        # make sure the article is of the appropriate length
        # I decided on at least 5 sentences!
        while len(reuters.sents(article)) < 5:
            article = random.choice(reuters.fileids())

        length = len(reuters.sents(article)) // 2
        summary_sentences = cfs.summarize(article, length)
        print_summary(summary_sentences)
Example No. 5
    def load_reuters(self):

        sents = reuters.sents()
        print "Done loading reuters, cleaning..."
        # clean, etc...
        data = []
        for sentence in sents:
            x = []
            for word in sentence:
                if word in self.punctset:
                    continue
                w = word.lower().strip(self.punctuation)
                ind = self.word2id.get(w, -1)
                if ind < 0:
                    ind = len(self.word2id)
                    self.word2id[w] = ind
                x.append(ind)
            data.append(x)
        print "Done cleaning reuters, vectorizing..."

        X = sparse.lil_matrix((
            len(self.word2id),
            len(data),
        ))

        for (j, dat) in enumerate(data):
            for i in dat:
                X[i, j] = 1

        return X
Example No. 6
def language_mode():
    # Load the corpus from NLTK
    categories = reuters.categories()
    corpus = reuters.sents(categories=categories)
    print(f'first 3 sentences of the corpus:\n {corpus[:3]}')
    term_count = {}
    bigram_count = {}
    for doc in corpus:
        doc = ['<s>'] + doc
        for i in range(0, len(doc) - 1):
            #bigram :[i,i +1]
            term = doc[i]
            bigram = doc[i:i + 2]
            if term in term_count:
                term_count[term] += 1
            else:
                term_count[term] = 1
            bigram = ' '.join(bigram)
            if bigram in bigram_count:
                bigram_count[bigram] += 1
            else:
                bigram_count[bigram] = 1
    print(f'term_count length is: \n {len(term_count)}'
          )  # {'<s>': 54716, 'ASIAN': 12, 'EXPORTERS': 46, 'FEAR'
    print(
        f'bigram_count length is: \n {len(bigram_count)}'
    )  #{'<s> ASIAN': 4, 'ASIAN EXPORTERS': 1, 'EXPORTERS FEAR': 1, 'FEAR DAMAGE': 1,
    return term_count, bigram_count
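
As a follow-up sketch (my addition, not part of the original), the returned counts can be turned into an add-one-smoothed bigram probability, approximating the vocabulary size by the number of distinct unigrams:

def bigram_prob(term_count, bigram_count, w1, w2):
    # P(w2 | w1) with add-one smoothing over the observed vocabulary
    vocab_size = len(term_count)
    return (bigram_count.get(f'{w1} {w2}', 0) + 1) / (term_count.get(w1, 0) + vocab_size)

term_count, bigram_count = language_mode()
print(bigram_prob(term_count, bigram_count, 'the', 'company'))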
Example No. 7
def reuters_to_df(set_name, label_to_idx):

    data = [x for x in reuters.fileids() if set_name in x]

    # collect all data to create df from
    all_texts = [
        " ".join([" ".join(sen) for sen in reuters.sents(doc_id)])
        for doc_id in data
    ]

    all_labels = np.zeros((len(all_texts), len(label_to_idx)))
    all_label_indices = [[
        label_to_idx[lab] for lab in reuters.categories(doc_id)
    ] for doc_id in data]

    for i, labs in enumerate(all_label_indices):
        # binary encode the labels
        all_labels[i][labs] = 1

    all_labels = all_labels.astype(int)
    # all_labels[all_label_indices] = 1
    cols = ["text"]
    label_cols = ["topic_{}".format(lab) for lab in reuters.categories()]
    cols.extend(label_cols)
    # create df and set values
    df = pd.DataFrame(columns=cols)
    df["text"] = all_texts
    df[label_cols] = all_labels

    return df
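
A hedged usage sketch: the label_to_idx mapping below is an assumption about what the caller is expected to pass; building it from the full Reuters category list keeps the binary label matrix aligned with the topic_* columns. The usual pandas/numpy/reuters imports from the excerpt are assumed.

label_to_idx = {lab: i for i, lab in enumerate(reuters.categories())}
train_df = reuters_to_df("training", label_to_idx)
test_df = reuters_to_df("test", label_to_idx)
print(train_df.shape, test_df.shape)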
Example No. 8
def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the reuters corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time import_reuters_files(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total), 
                    file=log)
Example No. 9
def filesForEval(cfs):

    #the number of sentences in each summary to be generated
    length = 5

    #the number of summaries to be written into the file
    number_of_summaries = 10

    #create the file containing the summaries
    summFile = open('random_order_summaries_2.txt', 'w')
    #create the file which will be the key
    keyFile = open('key_2.txt', 'w')

    #generate the appropriate number of summaries
    for n in range(number_of_summaries):
        #get a random article from the corpus
        article = random.choice(reuters.fileids())
        #make sure the article is of the appropriate length
        # I decided on at least four times the summary length in this case.
        while len(reuters.sents(article)) < length * 4:
            article = random.choice(reuters.fileids())

        #print info about the article into the summary-containing document
        summFile.write('Article #' + str(n) + '\n')
        summFile.write('\nfileid: ' + article + '\n\n')

        #print info about the article into the key document
        keyFile.write('Article #' + str(n) + '\n')
        keyFile.write('fileid: ' + article + '\n\n')

        #get a list of sentences that is the summary generated by our algorithm
        summSents = cfs.summarize(article, length)
        #insert a marker to make sure we remember it is the summary
        summSents.insert(0, 'summ')

        #get a list of sentences that were randomly ordered
        randSents = getRandom(article, length)
        #insert a marker to make sure we remember it is the random selection
        randSents.insert(0, 'rand')

        #mix up the ordering
        summs = [summSents, randSents]
        random.shuffle(summs)

        #Write the summaries into the file and write the key, in a semi-nice format
        for summ in summs:
            for i, sentence in enumerate(summ):
                if i == 0:
                    keyFile.write(sentence)
                else:
                    for word in sentence:
                        summFile.write(word + ' ')
                    summFile.write('\n')
            summFile.write('\n')
            keyFile.write(' ')
        summFile.write('\n')
        keyFile.write('\n')

    summFile.close()
    keyFile.close()
Example No. 10
 def __init__(self):
     training_files = [
         fileid for fileid in reuters.fileids()
         if fileid.startswith('training')
     ]
     super(ReutersTrainingCorpus,
           self).__init__(reuters.sents(training_files))
Example No. 11
def create_raw_data_for_classifier(start_pt, end_pt):
    # printing process id
    #print("ID of process running : {}".format(os.getpid()))

    #df_for_raw = pd.DataFrame(columns=['sentences','polarity'])
    pos = 0
    neg = 0
    polarity_list = []
    sentncs_list = []

    #for i in range(len(reuters.sents())):
    for i in range(start_pt, end_pt):
        sentncs = " ".join(reuters.sents()[i])
        #print("sentncs = ", sentncs)
        blob = TextBlob(sentncs)
        # keep only sentences with non-zero polarity so sentences and labels stay aligned
        if blob.sentiment.polarity > 0:
            sentncs_list.append(sentncs)
            polarity_list.append('pos')
            pos = pos + 1
        elif blob.sentiment.polarity < 0:
            sentncs_list.append(sentncs)
            polarity_list.append('neg')
            neg = neg + 1
    raw_data = list(zip(sentncs_list, polarity_list))
    #print(reutersDf.tail(10))
    print("raw_data len = ", len(raw_data))
    print("Total pos = ", pos, " Total Neg =", neg)
    print(raw_data[0])

    return raw_data
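
A minimal invocation sketch, assuming the imports the excerpt relies on (TextBlob and the NLTK reuters corpus); the 0-1000 range is arbitrary, and indexing reuters.sents() repeatedly is slow, so small ranges are advisable:

# Assumed imports; not shown in the original excerpt.
from nltk.corpus import reuters
from textblob import TextBlob

raw_data = create_raw_data_for_classifier(0, 1000)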
Example No. 12
def reuters_idf_dict(current_docs, file_name, order=2):
    """
    """
    idf_file = file_name + ".idf"
    dict_idf = {}
    if os.path.exists(idf_file):
        with open(idf_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split("\t")
                # print(values)
                dict_idf[tuple(values[0].split())] = float(values[1])
        dict_idf = make_concept_idf_dict(current_docs, dict_idf,
                                         len(reuters.fileids()))
        return dict_idf
    else:
        logger.info("Process reuters idf.")
        l_docs = []
        for fileid in reuters.fileids():
            l_docs.append(reuters.sents(fileids=[fileid]))
        dict_idf = make_concept_idf_dict(l_docs, order=order)
        with open(idf_file, 'w', encoding='utf-8') as f:
            for concept in dict_idf.keys():
                f.write(' '.join(concept) + '\t' + str(dict_idf[concept]) +
                        '\n')
        dict_idf = make_concept_idf_dict(current_docs, dict_idf,
                                         len(reuters.fileids()))
        return dict_idf
Example No. 13
    def __init__(self, vocab_path: os.PathLike, spell_error_path: os.PathLike):
        # Build the vocabulary
        with open(vocab_path) as f:
            self.vocab = {line.strip() for line in f}
        self.vocab_size = len(self.vocab)

        # Load the corpus and build the language model
        categories = reuters.categories()
        corpus = reuters.sents(categories=categories)
        self.unigram_count, self.bigram_count = defaultdict(int), defaultdict(
            int)
        for doc in corpus:
            doc = ['<s>'] + doc
            for i in range(1, len(doc)):
                self.unigram_count[doc[i]] += 1
                self.bigram_count[(doc[i - 1], doc[i])] += 1

        # Estimate spelling-error probabilities P(mistake|correct)
        self.channel_prob = defaultdict(dict)
        with open(spell_error_path) as f:
            for line in f:
                temp = line.split(':')
                correct = temp[0].strip()
                mistakes = [m.strip() for m in temp[1].strip().split(',')]
                for m in mistakes:
                    self.channel_prob[correct][m] = 1. / len(mistakes)
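
A sketch of how the structures built in __init__ are typically combined in a noisy-channel corrector: log P(mistake | candidate) from channel_prob plus add-one-smoothed bigram scores around the candidate. The function below is my illustration written over an instance, not a method of the original class:

import math

def candidate_score(corrector, candidate, mistake, prev_word, next_word):
    # channel model: P(mistake | candidate), with a small floor for unseen pairs
    channel = corrector.channel_prob.get(candidate, {}).get(mistake, 1e-10)
    score = math.log(channel)
    # language model: add-one smoothed bigrams around the candidate
    vocab_size = len(corrector.unigram_count)
    score += math.log((corrector.bigram_count[(prev_word, candidate)] + 1) /
                      (corrector.unigram_count[prev_word] + vocab_size))
    score += math.log((corrector.bigram_count[(candidate, next_word)] + 1) /
                      (corrector.unigram_count[candidate] + vocab_size))
    return score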
Example No. 14
def initCount2():
    corpus_raw_text = reuters.sents(categories=reuters.categories())
    gram_count = {}
    count = [0, 0, 0]
    for sents in corpus_raw_text:
        sents = ['<s>'] + sents + ['</s>']
        # remove string.punctuation
        for words in sents[::]:  # use [::] to remove the continuous ';' ';'
            if (words in [
                    '\'\'', '``', ',', '--', ';', ':', '(', ')', '&', '\'',
                    '!', '?', '.'
            ]):
                sents.remove(words)

        # count the n-gram
        for n in range(1, 3):  # only count 1-grams and 2-grams
            if (len(sents) <= n):  # 'This sentence is too short!'
                continue
            else:
                for i in range(n, len(sents) + 1):
                    gram = sents[i - n:i]  # ['richer', 'fuller', 'life']
                    key = ' '.join(gram)  # richer fuller life
                    count[n] = count[n] + 1
                    if (key in gram_count):  # use dict's hash
                        gram_count[key] += 1
                    else:
                        gram_count[key] = 1
    with open("Count.pk", "wb") as fCount:
        pickle.dump([gram_count], fCount)
    return gram_count, count[0], count[1], count[2]
Example No. 15
def getreuters():
	# don't tokenize
	from nltk.corpus import reuters
	reuterslist=[]
	for article in reuters.fileids():
		reuterslist.append(reuters.sents(article))
	return reuterslist
Example No. 16
def get_tokenized_sentences(dataset):
    if dataset == 'brown_corpus':
        return list(brown.sents())

    elif dataset == 'reuters_corpus':
        return list(reuters.sents())

    elif dataset == 'gatsby':
        with open('./data/gatsby.txt', 'r') as f:
            text = '\n'.join(f.readlines())
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent

    elif dataset == 'RACE_corpus':
        df_1 = pd.read_csv('./data/middle_combined.csv')
        df_2 = pd.read_csv('./data/high_combined.csv')
        text = '\n'.join(list(df_1['text']) + list(df_2['text']))
        # This dataset appears to have an issue with period spacing
        text = text.replace(".", ". ")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent

    elif dataset in ('news_small', 'news_large'):
        df = pd.read_csv(f'./data/all_the_{dataset}.csv')
        text = '\n'.join(list(df['content']))
        text = text.replace("   ", " ")
        text = text.replace("   ", " ")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent

    elif dataset.startswith("books"):
        # Get all books
        if dataset == 'books':
            text = ""
            dif = ["middle", "high", "college"]
            for d in dif:
                files = glob.glob(f"./data/books/{d}/*.txt")
                for file in files:
                    with open(file, 'r') as f:
                        text += "\n".join(f.readlines()) + "\n"
        # Get all books of specified difficulty
        else:
            difficulty = dataset.split("_")[1]
            files = glob.glob(f"./data/books/{difficulty}/*.txt")
            text = ""
            for file in files:
                with open(file, 'r') as f:
                    text += "\n".join(f.readlines()) + "\n"
        # Underscores are used to indicate italics here and should be dropped.
        text = text.replace("_", "")
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent

    # Assume .txt document
    else:
        with open(dataset, 'r') as f:
            text = "\n".join(f.readlines())
        tok_sent = [word_tokenize(t) for t in sent_tokenize(text)]
        return tok_sent
Example No. 17
def load_reuters():
    from nltk.corpus import reuters
    text = reuters.sents()
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text,
                        reserved_tokens=[BOS_TOKEN, EOS_TOKEN, PAD_TOKEN])
    corpus = [vocab.convert_tokens_to_idx(sentence) for sentence in text]
    return corpus, vocab
Example No. 18
def q4(query):
    print("\n" + "~"*10 + " Q4 " + "~"*10)
    
    # retrieve seventh sentence and join into a string
    # also remove punctuation from it since they are separated out
    pick = " ".join( remove_punc(word) for word in reuters.sents()[7] if word not in punc)
    result = (pick, jaccard(query, pick))
    print(result)
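
The helpers punc, remove_punc and jaccard are not shown in this excerpt; one plausible minimal version, for context only (names and behaviour are assumptions):

import string

punc = set(string.punctuation)

def remove_punc(word):
    # drop punctuation characters from a token
    return "".join(ch for ch in word if ch not in punc)

def jaccard(a, b):
    # Jaccard similarity between the token sets of two strings
    sa, sb = set(a.lower().split()), set(b.lower().split())
    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0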
Example No. 19
def q1():
    global docs

    # retrieve sentences from three categories
    docs = list(reuters.sents(categories='bop'))
    docs = docs + list(reuters.sents(categories='cocoa'))
    docs = docs + list(reuters.sents(categories='zinc'))
    #print(docs)

    # convert list of lists into list of strings
    doc = []
    for sent in docs:
        # case fold each sentence
        doc.append(" ".join([
            word.lower() for word in sent
            if word not in set(string.punctuation)
        ]))
    docs = doc
Example No. 20
def get_corpus():
    """To get nltk corpora."""
    from nltk.corpus import brown
    from nltk.corpus import reuters

    corpus = add_sent_marker(brown.sents())
    corpus = corpus + add_sent_marker(reuters.sents())
    corpus = add_sent_marker(corpus)
    return corpus
Example No. 21
def train_skip(CBOW_skip=0, embed_size=100, window=5, min_count=5, epochs=5, workers=1):
    corpus_name = 'Reuters+Brown+Gutenberg'
    corpus = itertools.chain(reuters.sents(), brown.sents(), gutenberg.sents())
    corpus = list(corpus)
    model = Word2Vec(corpus, sg=CBOW_skip, size=embed_size, window=window, min_count=min_count, workers=workers)
    logging.warning("[1]"+",".join([corpus_name,str(CBOW_skip),str(embed_size),str(window)]))
    model.train(corpus, total_examples=len(corpus), epochs=epochs)
    model = gensim_to_embed(model)
    return wordsim_eval(model)
Example No. 22
def get_reuters_token_list_by_sentence(num_doc=100):
    """ Get a test data from reuters corpus.
    Stopwords will be included to see how HMM_LDA works with these stopwords.

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    Returns
    -------
    voca: ndarray
        vocabulary
    corpus: list
        nested list of word indices (document -> sentence -> word index)

    """
    file_list = reuters.fileids()
    corpus = [reuters.sents(file_list[i]) for i in range(num_doc)]

    valid_voca = set(w.lower() for w in nltk.corpus.words.words())
    stop = stopwords.words('english')
    valid_voca = valid_voca.union(stop)

    tmp_corpus = list()
    voca_dic = dict()
    voca = list()
    for doc in corpus:
        tmp_doc = list()
        for sent in doc:
            tmp_sent = list()
            for word in sent:
                if word in valid_voca:
                    tmp_sent.append(word)
                    if word not in voca_dic:
                        voca_dic[word] = len(voca_dic)
                        voca.append(word)
            if len(tmp_sent) > 0:
                tmp_doc.append(tmp_sent)
        if len(tmp_doc) > 0:
            tmp_corpus.append(tmp_doc)

    # convert token list to word index list
    corpus = list()
    for doc in tmp_corpus:
        new_doc = list()
        for sent in doc:
            new_sent = list()
            for word in sent:
                new_sent.append(voca_dic[word])
            new_doc.append(new_sent)
        corpus.append(new_doc)

    return np.array(voca), corpus
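
A quick usage sketch (my addition) to show the shape of the return values:

voca, corpus = get_reuters_token_list_by_sentence(num_doc=50)
print(len(voca), len(corpus))   # vocabulary size, number of documents kept
print(corpus[0][0][:10])        # word indices of the first sentence of the first document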
Example No. 24
def create_model_from_NLTK():
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        return create_model(filepath= filepath, save=False)
    else:
        from nltk.corpus import reuters, brown, gutenberg
        sents = reuters.sents() + brown.sents()
        for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
            sents += gsents

        return create_model(sentences=sents, savename=filepath)
Example No. 25
def create_corpus():
    """Removing punctuations from the sentences of corpus."""
    # Here, I am taking text from the reuters, webtext and brown corpora.
    rr_corpus = reuters.sents() + webtext.sents() + brown.sents()
    punctuations = [p for p in string.punctuation]
    cleaned_corpus = []

    for idx in range(len(rr_corpus)):
        cleaned_corpus.append(
            [w for w in rr_corpus[idx] if w not in punctuations])

    return cleaned_corpus
Example No. 26
def _load_reuters_docs():
    test_docs = []
    train_docs = []

    i = 0
    for fileid in reuters.fileids():
        i += 1
        if 'test' in fileid:
            # test_docs.append((reuters.words(fileid), reuters.sents(fileid)))
            test_docs.append(
                TokenizedDoc(reuters.words(fileid), reuters.sents(fileid),
                             reuters.categories(fileid)))
        elif 'training' in fileid:
            # train_docs.append((reuters.words(fileid), reuters.words(fileid)))
            train_docs.append(
                TokenizedDoc(reuters.words(fileid), reuters.sents(fileid),
                             reuters.categories(fileid)))
        else:
            print(
                "Document not recognized as part of training-set or test-set while extracting the Reuters Corpus"
            )
    return train_docs, test_docs
Example No. 27
def load_reuters_corpus() -> List[str]:
    nltk.download('reuters')
    sentences = list(
        filter(
            lambda sent: (len(sent) <= 30) and
            (len(sent) >= 3) and any(map(lambda word: word.isalpha(), sent))
            and len(list(filter(lambda word2: word2.isupper(), sent))) <
            (len(sent) // 4), reuters.sents()))
    mdetok = TreebankWordDetokenizer()
    return list(
        map(
            lambda sent: mdetok.detokenize(
                (' '.join(sent).replace('``', '"').replace("''", '"').replace(
                    '`', "'")).split()), sentences))
Example No. 28
def process_reuters():
    print 'reuters'
    from nltk.corpus import reuters
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_reuters_tmp.txt'
    for f in reuters.fileids():
        sents = reuters.sents(f)
        for i in range(len(sents)):
            sent = sents[i]
            if (word in sent):
                appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
                print count
Example No. 29
def pre_process():
    raw_sentences = reuters.sents(reuters.fileids())
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(raw_sentences)
    count_thresh = 5
    low_count_words = [
        w for w, c in tokenizer.word_counts.items() if c < count_thresh
    ]

    for w in low_count_words:
        del tokenizer.word_index[w]
        del tokenizer.word_docs[w]
        del tokenizer.word_counts[w]

    word_index_dict = tokenizer.word_index
    index_word_dict = {word_index_dict[word]: word for word in word_index_dict}
    sentences_word_index = tokenizer.texts_to_sequences(raw_sentences)
    return sentences_word_index, word_index_dict, index_word_dict
Example No. 30
def get_default_sentences() -> list:
    nltk.download('brown')
    brown_tokenized_sentences = brown.sents()
    brown_sentences = detok_sentences(brown_tokenized_sentences)
    nltk.download('gutenberg')
    nltk.download('punkt')
    gutenberg_tokenized_sentences = gutenberg.sents()
    gutenberg_sentences = detok_sentences(gutenberg_tokenized_sentences)
    nltk.download('reuters')
    reuters_tokenized_sentences = reuters.sents()
    reuters_sentences = detok_sentences(reuters_tokenized_sentences)
    nltk.download('webtext')
    webtext_tokenized_sentences = webtext.sents()
    webtext_sentences = detok_sentences(webtext_tokenized_sentences)
    nltk.download('inaugural')
    inaugural_tokenized_sentences = inaugural.sents()
    inaugural_sentences = detok_sentences(inaugural_tokenized_sentences)
    return brown_sentences + gutenberg_sentences + reuters_sentences + webtext_sentences + inaugural_sentences
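
detok_sentences is not shown in this excerpt; a plausible helper, mirroring the TreebankWordDetokenizer used in an earlier example, might look like this (an assumption, not the project's actual code):

from nltk.tokenize.treebank import TreebankWordDetokenizer

def detok_sentences(tokenized_sentences):
    # Rejoin each token list into a plain-text sentence.
    detok = TreebankWordDetokenizer()
    return [detok.detokenize(sent) for sent in tokenized_sentences]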
Example No. 31
    def N_gramprediction(self, input_text):
        from nltk.corpus import reuters
        from nltk import bigrams, trigrams
        from collections import Counter, defaultdict
        import random

        # Create a placeholder for model
        model = defaultdict(lambda: defaultdict(lambda: 0))
        # Count frequency of co-occurrence
        for sentence in reuters.sents():
            for w1, w2, w3 in trigrams(sentence, pad_right=True,
                                       pad_left=True):
                model[(w1, w2)][w3] += 1

        # Let's transform the counts to probabilities
        for w1_w2 in model:
            total_count = float(sum(model[w1_w2].values()))
            for w3 in model[w1_w2]:
                model[w1_w2][w3] /= total_count

        # starting word
        text1 = str(input_text)
        text = list(text1.split(' '))
        sentence_finished = False
        print(text1)
        while not sentence_finished:
            # select a random probability threshold
            r = random.random()
            accumulator = .0

            for word in model[tuple(text[-2:])].keys():
                accumulator += model[tuple(text[-2:])][word]
                # select words that are above the probability threshold
                if accumulator >= r:
                    text.append(word)
                    break

            if text[-2:] == [None, None]:
                sentence_finished = True

        return (' '.join([t for t in text if t]))
Example No. 32
def getRandom(article, length):
    allSents = list(enumerate(list(reuters.sents(article))))
    print(len(allSents))

    sentences = []
    sentences.append(allSents[0])
    del allSents[0]

    for n in range(length):
        sent = random.choice(allSents)
        sentences.append(sent)
        allSents.remove(sent)

    sentences.sort()

    sentList = []
    for sent in sentences:
        sentence = sent[1]
        sentList.append(sentence)

    return sentList
Example No. 33
def reuters_idf_dict(current_docs, file_name, order=2):
    """
    """
    idf_file = file_name + ".idf"
    dict_idf = {}
    if os.path.exists(idf_file):
        with open(idf_file, 'r') as f:
            for line in f:
                values = line.split("\t")
                dict_idf[tuple(values[0].split())] = float(values[1])
        return dict_idf
    else:
        l_docs = []
        for doc in current_docs:
            l_docs.append(doc)
        for fileid in reuters.fileids():
            l_docs.append(reuters.sents(fileids=[fileid]))
        dict_idf = make_concept_idf_dict(l_docs, order)
        with open(idf_file, 'w') as f:
            for concept in dict_idf.keys():
                f.write(' '.join(concept) + '\t' + str(dict_idf[concept]) +
                        '\n')
        return dict_idf
Example No. 34
datafile = open(datapath, "r")
datalines = []
for i in range(1000):
    dataline = datafile.readline().split('\t')
    datalines.append(dataline)
datafile.close()

end = time.time()
print("time of loading data", end - start)
# begin correction
start = time.time()

n = 100
fres = open("result.txt", "w")
corpus_raw_text = reuters.sents(categories=reuters.categories())
corpus_text = []
for sents in corpus_raw_text:
    sents = ['<s>'] + sents + ['</s>']
    # remove string.punctuation
    for words in sents[::]:  # use [::] to remove the continuous ';' ';'
        if (words in [
                '\'\'', '``', ',', '--', ';', ':', '(', ')', '&', '\'', '!',
                '?', '.'
        ]):
            sents.remove(words)
    corpus_text.extend(sents)
vocab_corpus = {}.fromkeys(corpus_text).keys()
vocab_corpus = list(vocab_corpus)
vocabCount = len(vocab_corpus)
corpus_str = ' '.join(corpus_text)
Example No. 35
 def __init__(self, language):
     self.language = language
     # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et al., 2017)
     if language == 'english':
         self.avg_word_length = 5.3
         # from Beker, Henry; Piper, Fred. Cipher Systems: The Protection of Communications.
         self.char_frequency = {
             'a': 8.167,
             'b': 1.492,
             'c': 2.782,
             'd': 4.253,
             'e': 12.702,
             'f': 2.228,
             'g': 2.015,
             'h': 6.094,
             'i': 6.966,
             'j': 0.153,
             'k': 0.772,
             'l': 4.025,
             'm': 2.406,
             'n': 6.749,
             'o': 7.507,
             'p': 1.929,
             'q': 0.095,
             'r': 5.987,
             's': 6.327,
             't': 9.056,
             'u': 2.758,
             'v': 0.978,
             'w': 2.360,
             'x': 0.150,
             'y': 1.974,
             'z': 0.074
         }
         self.dic = pyphen.Pyphen(lang='en')
         self.reuters = reuters.words()
         self.unigram_counts = Counter(self.reuters)
         bigrams = []
         for sent in reuters.sents():
             bigrams.extend(
                 nltk.bigrams(sent, pad_left=True, pad_right=True))
         self.bigram_counts = Counter(bigrams)
     else:  # spanish
         self.avg_word_length = 6.2
         # self.char_frequency = {'a': 12.525,'b': 2.215,'c': 4.139,'d': 5.860,'e': 13.681,
         #                        'f': 0.692,'g': 1.768,'h': 0.703,'i': 6.247,'j': 0.443,
         #                        'k': 0.011,'l': 4.967,'m': 3.157,'n': 6.71,'o': 8.683,
         #                        'p': 2.510, 'q': 0.877,'r': 6.871,'s': 7.977,'t': 4.632,
         #                        'u': 3.927, 'v': 1.138,'w': 0.017,'x': 0.215,'y': 1.008,
         #                        'z': 0.517,'á': 0.502, 'é': 0.433, 'í': 0.725, 'ñ': 0.311,
         #                        'ó': 0.827, 'ú': 0.168, 'ü': 0.012}
         # self.dic = pyphen.Pyphen(lang='es')
         self.cess = cess.words()
         self.unigram_counts = Counter(self.cess)
         bigrams = []
         for sent in cess.sents():
             bigrams.extend(
                 nltk.bigrams(sent, pad_left=True, pad_right=True))
         self.bigram_counts = Counter(bigrams)
     # self.clf = svm.SVC()
     # self.model = LogisticRegression()
     self.model = svm.SVC(gamma=5)
Example No. 36
#!/usr/local/bin/python
import nltk, re, pprint
import triple
from nltk.corpus import reuters
from nltk.sem import relextract,extract_rels,rtuple

grammar = "Relation: {<DT>?<JJ>*<NN><V.*><NN>}"


cp = nltk.RegexpParser(grammar)
s = [nltk.pos_tag(s) for s in reuters.sents()[:30]]

#print sentence
#print cp.parse(sentence)
#nltk.ne_chunk


brown = nltk.corpus.brown #
for sent in s:
	tree = cp.parse(sent)
	for subtree in tree.subtrees():
		if subtree.label() == 'Relation':
			print(tree)
			print(subtree.leaves())
'''#

IN = re.compile(r'.*\bof\b.*')

for i,sent in enumerate(s):
	sent = nltk.ne_chunk(sent)
	rels = extract_rels('PERSON','ORGANIZATION',doc=sent,corpus='ace',pattern=IN,window=7)
Example No. 37
        self.logger.info("returning top terms")
        return [term for term, ig in term_ig[:k]]

    def top_common_words(self, k):
        self.logger.info("calculating top %d of %d word terms according to frequency", k, len(self._total_freq))

        # terms = self.all_terms()
        # terms_freq = [(term, sum((term.frequency(doc) for doc in self._documents))) for term in terms]
        terms_freq = sorted(self._total_freq.items(), key=lambda x: x[1], reverse=True)

        self.logger.info("returning top %d word terms according to frequency", k)

        return [term for term, freq in terms_freq[:k]]


if __name__ == '__main__':
    training_fileids = fileids = filter(lambda x: "training" in x, reuters.fileids())
    documents = reuters.sents(training_fileids)
    # dict = set(reuters.words(training_fileids))

    # print documents[0]
    # print " ".join(documents[0])
    # print WordTerm("in").frequency(documents[0])

    print 'Checking Vectorizer'
    documents = get_document_objects(documents)
    w = WordTermExtractor(documents, None)
    print documents[0].get_freq('BAHIA')
    print documents[0].get_freq('bahia')

    print ProjectParams.terms_matrix.total_freq
Example No. 38
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding abc sentence structures ({0})...".format(len(abc.sents()))
for sentence in abc.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding reuters sentence structures ({0})...".format(len(reuters.sents()))
for sentence in reuters.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding brown sentence structures ({0})...".format(len(brown.sents()))
for sentence in brown.sents():
	processed_count += 1
	try:
Example No. 39
        
        
        
"""
There are three options to train the true caser:
1) Use the sentences in NLTK
2) Use the train.txt file. Each line must contain a single sentence. Use a large corpus, for example Wikipedia
3) Use Bigrams + Trigrams count from the website http://www.ngrams.info/download_coca.asp

The more training data, the better the results
"""
         

# :: Option 1: Train it based on NLTK corpus ::
print "Update from NLTK Corpus"
NLTKCorpus = brown.sents()+reuters.sents()+nltk.corpus.semcor.sents()+nltk.corpus.conll2000.sents()+nltk.corpus.state_union.sents()
updateDistributionsFromSentences(NLTKCorpus, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# :: Option 2: Train it based the train.txt file ::
""" #Uncomment, if you want to train from train.txt
print "Update from train.txt file"
sentences = []
for line in open('train.txt'):        
    sentences.append(line.strip())
    
tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
updateDistributionsFromSentences(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
"""     
   
# :: Option 3: Train it based ngrams tables from http://www.ngrams.info/download_coca.asp ::    
""" #Uncomment, if you want to train from train.txt
Example No. 40
    # def terminals(self, term, document):
    #     return (
    #         self.bool(term, document), self.tf(term, document), self.tf_idf(term, document), self.tf_ig(term, document),
    #         self.tf_chi(term, document), self.tf_rf(term, document))

    def raw_terminals(self, term, document):
        return (self.bool(term, document), self.tf(term, document), self.max_prob_term_and_category(term, document),
                self.max_prob_term_not_category(term, document), self.avg_prob_term_category(term, document),
                self.avg_prob_term_not_category(term, document), self.first_occurrence_perc(term, document))


if __name__ == '__main__':
    training_fileids = fileids = filter(lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
                                        reuters.fileids())
    documents = [sum(reuters.sents(fid), []) for fid in training_fileids]
    doc = documents[0]
    term = terminals.WordTerm("in")
    docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]
    print docs_categories
    print doc
    fe = TWSCalculator(documents, docs_categories)

    print "tf =", fe.tf(term, doc), "idf =", fe.idf(term), "tf-idf =", fe.tf_idf(term, doc)

    term = terminals.WordTerm("in")

    print 'TF-CHI: ', fe.tf_chi(term, doc)
    print 'TF-CHI: ', fe.tf_chi(term, doc)
    print 'TF-IG: ', fe.tf_ig(term, doc)
    print 'TF-IG: ', fe.tf_ig(term, doc)
Example No. 41
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from features import TWSCalculator
from readers import NewsgroupsReader
from terminals import get_document_objects, WordTermExtractor, WordTerm
from terms_lists.ng20_ig import ng_20_ig500
from terms_lists.r8_ig import r_eight_terms

__author__ = 'itay'
if __name__ == '__main__':
    cats_limiter = categories = ['earn', 'acq', 'crude', 'trade', 'money-fx', 'interest', 'money-supply',
    'ship']  # top 8
    training_fileids = fileids = filter(lambda fileid: "training" in fileid and len(reuters.categories(fileid)) == 1,
                                        reuters.fileids(cats_limiter))

    training_documents = [" ".join(sum(reuters.sents(fid), [])) for fid in training_fileids]
    training_docs_categories = [reuters.categories(fid)[0] for fid in training_fileids]

    training_documents = [doc.lower() for doc in training_documents]  # lowercase the documents

    # training_documents_objects = get_document_objects(training_documents, training_docs_categories)
    # training_documents_objects = NewsgroupsReader(False).get_training()
    # training_documents = [d.doc for d in training_documents_objects]
    # training_docs_categories = [d.category for d in training_documents_objects]

    #top IG r8:
    # words = ng_20_ig500
    # tws_calculator = TWSCalculator(training_documents_objects, training_docs_categories)
    # word_term_extractor = WordTermExtractor(training_documents_objects, tws_calculator)
    #
    # top_terms = word_term_extractor.top_common_words(500)
Example No. 42
 def __init__(self):
     test_files = [fileid for fileid in reuters.fileids() if fileid.startswith('test')]
     super(ReutersTestCorpus, self).__init__(reuters.sents(test_files))
Example No. 43
#!/usr/bin/python
#coding:utf-8

# 2013/02/08

from nltk.corpus import reuters
import nltk

# Inspect which POS tags nltk.pos_tag() produces
sents=reuters.sents()
tags = set( tag for sent in sents[:5000] for word,tag in nltk.pos_tag(sent) )
print tags
# set(['PRP$', 'VBG', 'VBD', '``', 'VBN', ',', "''", 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', 'NN', 'POS', '.', 'TO', 'PRP', 'RB', ':', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'LS', 'PDT', 'RBS', 'RBR', 'CD', '-NONE-', 'EX', 'IN', 'WP$', 'MD', 'NNPS', 'JJS', 'JJR'])
# VB,VBG,VBD,VBN,VBP,VBZ,JJ,JJS,JJR,NN,NNS,NNP,NNPS
# VB : verb, base form
# VBD : verb, past tense
# VBG : gerund / present participle
# VBN : past participle
# VBP : verb, non-3rd person singular present
# VBZ : verb, 3rd person singular present
# JJ : adjective
# JJR : adjective, comparative
# JJS : adjective, superlative
# NN : noun, singular
# NNS : noun, plural
# NNP : proper noun, singular
# NNPS : proper noun, plural
Example No. 44
 def __init__(self):
     training_files = [fileid for fileid in reuters.fileids() if fileid.startswith('training')]
     super(ReutersTrainingCorpus, self).__init__(reuters.sents(training_files))