Example #1
def reuters_dataset():
    nltk.download('reuters')
    nltk.download('stopwords')
    stop_words = stopwords.words("english")

    documents = reuters.fileids()

    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    print(len(train_docs), len(test_docs))

    vectorizer = TfidfVectorizer(stop_words=stop_words)

    vectorised_train_documents = vectorizer.fit_transform(train_docs)
    vectorised_test_documents = vectorizer.transform(test_docs)

    # print([reuters.categories(doc_id) for doc_id in test_docs_id])

    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])

    return vectorised_train_documents.toarray(
    ), vectorised_test_documents.toarray(), train_labels, test_labels
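
The snippet above assumes nltk, the Reuters and stopwords corpora, TfidfVectorizer and MultiLabelBinarizer are already imported at module level. A minimal sketch of that wiring and a call site, under those assumptions (the split sizes in the comment are those of the standard ApteMod partition):

# Imports assumed by reuters_dataset() above (a sketch, not part of the original snippet).
import nltk
from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

if __name__ == "__main__":
    X_train, X_test, y_train, y_test = reuters_dataset()
    # The standard ApteMod split yields 7769 training and 3019 test documents.
    print(X_train.shape, y_train.shape)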
Example #2
def load_data(config={}):

    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words, binary=True)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])
    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': reuters.categories()
    }
    print(data['x_train'])
    print(data['y_train'])
    return data, vectorizer.vocabulary_
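
A hedged usage sketch for the load_data() above: fit a one-vs-rest logistic regression on the binarized labels. The classifier choice is illustrative, not part of the original snippet.

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

data, vocabulary = load_data()
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(data['x_train'], data['y_train'])
# For multilabel targets, score() reports subset accuracy (all labels must match).
print(clf.score(data['x_test'], data['y_test']))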
Example #3
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
    tokens = []
    for docid in train_docs:
        t = tokenize(reuters.raw(docid))
        tokens.extend(t)
    print(tokens[0])
    v = set(tokens)
    print("number of terms=", len(tokens))
    print("voc size=", len(v))
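
tokenize is defined elsewhere in the module this snippet was taken from; a minimal stand-in, assuming a plain lowercase/stopword filter, could look like this:

from nltk import word_tokenize
from nltk.corpus import stopwords

cached_stop_words = stopwords.words("english")

def tokenize(text):
    # Lowercase, keep purely alphabetic tokens, drop English stopwords.
    words = [w.lower() for w in word_tokenize(text)]
    return [w for w in words if w.isalpha() and w not in cached_stop_words]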
Example #4
def load_data():
    """
    Load the Reuters dataset.

    Returns
    -------
    train_docs, train_labels, test_docs, test_labels.
    """
    documents = reuters.fileids()
    train = [d for d in documents if d.startswith('training/')]
    train_docs = [reuters.raw(doc_id) for doc_id in train]
    train_docs = [text_prepare(x) for x in train_docs]
    train_labels = [reuters.categories(doc_id) for doc_id in train]

    test = [d for d in documents if d.startswith('test/')]
    test_docs = [reuters.raw(doc_id) for doc_id in test]
    test_docs = [text_prepare(x) for x in test_docs]
    test_labels = [reuters.categories(doc_id) for doc_id in test]

    print("len(train_docs)={}, len(train_labels)={}".format(
        len(train_docs), len(train_labels)))
    print("len(test_docs)={}, len(test_labels)={}".format(
        len(test_docs), len(test_labels)))

    mlb = MultiLabelBinarizer(classes=sorted(labels))
    train_labels = mlb.fit_transform(train_labels)
    test_labels = mlb.transform(test_labels)
    print("y_train.shape={}, y_test.shape={}".format(train_labels.shape,
                                                     test_labels.shape))

    return (train_docs, train_labels, test_docs, test_labels, mlb.classes)
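
text_prepare and labels also live elsewhere in the original module; a plausible minimal version of both, offered as an assumption rather than the original code:

import re
from nltk.corpus import reuters, stopwords

labels = reuters.categories()               # the 90 Reuters category names
STOPWORDS = set(stopwords.words("english"))

def text_prepare(text):
    # Assumed behaviour: lowercase, strip non-alphanumerics, drop stopwords.
    text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return " ".join(w for w in text.split() if w not in STOPWORDS)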
def get_reuters():
    # Documents in a category
    category_trade = reuters.fileids("trade")
    category_money = reuters.fileids("money-fx") + reuters.fileids(
        "money-supply")
    category_interest = reuters.fileids("interest")

    text_trade = []
    for i in range(0, len(category_trade)):
        text_trade.append(reuters.raw(category_trade[i]))

    text_money = []
    for i in range(0, len(category_money)):
        text_money.append(reuters.raw(category_money[i]))

    text_interest = []
    for i in range(0, len(category_interest)):
        text_interest.append(reuters.raw(category_interest[i]))

    trade_data = pd.DataFrame(data={'text': text_trade})
    trade_data['category'] = 'TRADE'
    money_data = pd.DataFrame(data={'text': text_money})
    money_data['category'] = 'MONEY'
    interest_data = pd.DataFrame(data={'text': text_interest})
    interest_data['category'] = 'INTEREST'

    # picking only relevant columns
    selected_columns = ['text', 'category']
    df = pd.concat([trade_data, money_data, interest_data])

    return df
Example #6
def get_word2idx():
    import cli
    config = cli.config
    docs, label_seqs, decode_inp, seq_len = load_hclf_reuters(config, "train")
    docs_train = [tokenize(reuters.raw(doc_id)) for doc_id in docs]
    docs, label_seqs, decode_inp, seq_len = load_hclf_reuters(config, "test")
    docs_test = [tokenize(reuters.raw(doc_id)) for doc_id in docs]
    docs = docs_train + docs_test
    max_docs_length = 0

    word2idx = Counter()
    word2idx["UNK"] = 0
    word2idx["NULL"] = 1  # for pad
    idx2word = []
    idx2word += ["UNK", "NULL"]
    for doc in docs:
        max_docs_length = len(
            doc) if len(doc) > max_docs_length else max_docs_length
        for token in doc:
            if token not in word2idx:
                word2idx[token] = len(word2idx)
                idx2word += [token]
    print(len(word2idx))
    #for i in range(len(idx2word)):
    #  print(idx2word[i], word2idx[idx2word[i]])
    shared = {"word2idx": word2idx, "idx2word": idx2word}
    json.dump(shared, open("data/word2idx_new.json", "w"))
Example #7
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}
    return data
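
The 'labels' entry is read from module globals; defining it once before calling load_data() satisfies that lookup. A sketch, assuming labels simply lists the category names:

from nltk.corpus import reuters

labels = reuters.categories()   # read by load_data() via globals()["labels"]
data = load_data()
print(len(data['labels']), "categories")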
Example #8
def prepareArticles(documentsIds=reuters.fileids()):
    """
    Organizes articles in a dictionary structure
    @params documentsIds: a list of document ids - by default get reuter's documents ids
    @rtype {Dictionary}
    """
    allDocs = []
    for id in documentsIds:
        doc = {
            'title':
            getDocTitle(reuters.raw(fileids=id)),
            'size':
            len(reuters.raw(fileids=id)),
            'text':
            removeTitleFromText(
                reuters.raw(fileids=id).replace('\n', ''),
                getDocTitle(reuters.raw(fileids=id))),
            'id':
            id
        }
        allDocs.append(doc)
    sortedDocs = sorted(allDocs, key=lambda x: x['size'])
    suitableDocs = [
        doc for doc in sortedDocs
        if doc['size'] >= 1000 and doc['size'] <= 2000
    ]
    suitableDocsIds = [
        at(doc, 'id', 'title', 'text') for doc in suitableDocs
        if len(nltk.sent_tokenize(doc['text'])) > 3
    ]
    return suitableDocsIds
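
getDocTitle, removeTitleFromText and at are external helpers here; at looks like pydash.at, and the two title helpers might plausibly behave as sketched below (assumptions, not the original code):

import nltk
from pydash import at   # picks the ('id', 'title', 'text') values out of each dict

def getDocTitle(raw_text):
    # Reuters raw articles begin with an upper-case headline on the first line.
    return raw_text.split('\n', 1)[0].strip()

def removeTitleFromText(text, title):
    # Drop the first occurrence of the headline from the body.
    return text.replace(title, '', 1).strip()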
Example #9
def extract_subset_data(seed=1337):
    train_data = {}
    test_data = {}
    random.seed(seed)
    # np.random.choice()
    for (label, train_amount, test_amount) in LABELS:
        train_category_id = list(
            filter(lambda x_train: x_train.startswith('train'),
                   reuters.fileids(label)))  # list of ids in train category
        random.shuffle(train_category_id)
        train_data[label] = [
            preprocessing(reuters.raw(train))
            for train in train_category_id[:train_amount]
        ]  # processed subset

        test_category_id = list(
            filter(lambda x_test: x_test.startswith('test'),
                   reuters.fileids(label)))  # list of ids in test category
        random.shuffle(test_category_id)
        test_data[label] = [
            preprocessing(reuters.raw(test))
            for test in test_category_id[:test_amount]
        ]  # processed subset

    return train_data, test_data
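
LABELS and preprocessing are module-level names in the original source; a plausible shape for them, with purely illustrative category names and counts:

# (label, number of training docs to keep, number of test docs to keep) -- illustrative values
LABELS = [("earn", 500, 200), ("acq", 500, 200), ("crude", 300, 100)]

def preprocessing(text):
    # Placeholder cleanup: lowercase and collapse whitespace.
    return " ".join(text.lower().split())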
Example #10
def getDocIDs_top10():
    # Top 10 Categories
    documents = [
        f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1
    ]
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))
    new_train_docs_id = []
    new_test_docs_id = []
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in train_docs_id]
        li_te = [
            f for f in reuters.fileids(categories=cat) if f in test_docs_id
        ]
        if len(li) > 20 and len(li_te) > 20:
            new_train_docs_id.extend(li)
            new_test_docs_id.extend(li_te)
    train_docs_id = new_train_docs_id
    test_docs_id = new_test_docs_id
    return (train_docs_id, test_docs_id)
Example #11
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id) for doc_id in test])
    data = {
        'x_train': xs['train'],
        'y_train': ys['train'],
        'x_test': xs['test'],
        'y_test': ys['test'],
        'labels': globals()["labels"]
    }
    return data
Example #12
def get_raw_data():
    
    nltk.download("reuters")
    from nltk.corpus import reuters
    
    documents = reuters.fileids()
    train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                                documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"),
                               documents))
    X_train = [(reuters.raw(doc_id)) for doc_id in train_docs_id]
    X_test = [(reuters.raw(doc_id)) for doc_id in test_docs_id]


    mlb = MultiLabelBinarizer()
    y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
    y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]

    all_dataa = X_train + X_test
    all_lavelsa = y_train + y_test

    mlb = MultiLabelBinarizer()
    datas_y = mlb.fit_transform(all_lavelsa)

    return all_dataa, all_lavelsa
def get_data_splits():
    train_docs, train_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                     for i in reuters.fileids()
                                     if i.startswith('training/')])
    test_docs, test_labels = zip(*[(reuters.raw(i), reuters.categories(i))
                                   for i in reuters.fileids()
                                   if i.startswith('test/')])
    return train_docs, train_labels, test_docs, test_labels
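
A hedged usage note: with the corpus's standard ApteMod partition the two splits come out to 7769 training and 3019 test documents.

train_docs, train_labels, test_docs, test_labels = get_data_splits()
print(len(train_docs), len(test_docs))   # 7769 3019 on the standard split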
Example #14
def main():
    train_docs = []  # contains train document numbers
    test_docs = []  # contains test document numbers
    train_category_docs = {}  # contains category corresponding train documents
    test_category_docs = {}  # contains category corresponding test documents
    train_data = {}  # contains train document numbers corresponding data
    test_data = {}  # contains test document numbers corresponding data

    categories = reuters.categories()  # Total categories list

    #print categories

    #print "Category Name" + " <------------------> " +  "No of Train documents in each Category"
    with open("category_train_docs.csv", "wb") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        for category_name in categories:
            category_docs = reuters.fileids(category_name)
            #print category_name + " <------------------> " + str(len(category_docs))
            train_list = []
            test_list = []
            for category_id in category_docs:
                if category_id.startswith("train"):
                    train_list.append(category_id.split('/')[1])

                else:
                    test_list.append(category_id.split('/')[1])
            writer.writerow([category_name] + train_list)
            #test_category_docs[category_name] = test_list
            #train_category_docs[category_name] = train_list

    exit()

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(doc_id)
            train_data[doc_id] = tokenize(reuters.raw(doc_id))
            doc_number = doc_id.split('/')[1]
            build_index_train(tokenize(reuters.raw(doc_id)), doc_number)
            #train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(doc_id)
            test_data[doc_id] = tokenize(reuters.raw(doc_id))
            doc_number = doc_id.split('/')[1]
            build_index_test(tokenize(reuters.raw(doc_id)), doc_number)

    #print train_data

    with open("inverted_train_index.csv", "wb") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        for words in inverted_index_train:
            if len(inverted_index_train[words]) >= 3:
                inverted_index_train_pruned[words] = (
                    inverted_index_train[words])
                writer.writerow([words] + inverted_index_train_pruned[words])

    for words in inverted_index_test:
        if len(inverted_index_test[words]) >= 3:
            inverted_index_test_pruned[words] = inverted_index_test[words]
Example #15
    def fetch_raw(self, dummy_input):
        # only applicable for raw dataset
        if self.name != self.base_name:
            return None
        info("Downloading raw {} dataset".format(self.name))
        if not self.nltk_dataset_resource_exists(Reuters.name):
            nltk_download(self.config, "reuters")
        # get ids
        categories = reuters.categories()
        self.num_labels = len(categories)
        self.label_names = []
        # train / test labels
        samples = {}
        train_docs, test_docs = [], []
        doc2labels = {}

        # get content
        for cat_index, cat in enumerate(categories):
            samples[cat] = [0, 0]

            # get all docs in that category
            for doc in reuters.fileids(cat):
                # document to label mappings
                if doc not in doc2labels:
                    # not encountered: init document label list
                    doc2labels[doc] = []
                    if doc.startswith("training"):
                        train_docs.append(doc)
                    else:
                        test_docs.append(doc)
                # count samples
                if doc.startswith("training"):
                    samples[cat][0] += 1
                else:
                    samples[cat][1] += 1
                # append the label
                doc2labels[doc].append(cat_index)

        doc2labels, label_set = self.delete_no_sample_labels(
            samples, doc2labels)

        self.train, self.test = [], []
        self.train_labels, self.test_labels = [], []
        # assign label lists
        for doc in train_docs:
            self.train.append(reuters.raw(doc))
            self.train_labels.append(doc2labels[doc])
        for doc in test_docs:
            self.test.append(reuters.raw(doc))
            self.test_labels.append(doc2labels[doc])

        self.label_names = label_set
        # self.labelset = list(sorted(set(self.train_labels)))
        self.roles = "train", "test"
        info("Loaded {} train & {} test instances.".format(
            len(self.train), len(self.test)))
        return self.get_all_raw()
Example #16
 def _extract(ids):
     X_train = [
         re.sub('[ \t\n]+', ' ', reuters.raw(i)) for i in ids
         if 'train' in i
     ]
     X_test = [
         re.sub('[ \t\n]+', ' ', reuters.raw(i)) for i in ids if 'test' in i
     ]
     return X_train, X_test
Example #17
    def __init__(self, min_eic=5):

        self.test_classes = []
        self.test_docs = []
        self.train_classes = []
        self.train_docs = []
        self.table_of_classes = []
        self.num_of_instances = []

        # minimal encounter in classes
        self.min_eic = min_eic

        if Path("training_cache/train_docs").is_file() and Path("training_cache/train_classes").is_file() \
                and Path("training_cache/test_docs").is_file() and Path("training_cache/test_classes").is_file() \
                and Path("classify_cache/table_of_classes").is_file():
            self.train_docs = joblib.load("training_cache/train_docs")
            self.train_classes = joblib.load("training_cache/train_classes")

            self.test_docs = joblib.load("training_cache/test_docs")
            self.test_classes = joblib.load("training_cache/test_classes")

            self.table_of_classes = joblib.load(
                "classify_cache/table_of_classes")
        else:
            raw_test_classes = []
            raw_train_classes = []

            for doc_id in reuters.fileids():
                if doc_id.startswith("train"):
                    self.train_docs.append(
                        prepare_text_for_analysis(reuters.raw(doc_id)))
                    raw_train_classes.append(reuters.categories(doc_id))
                else:
                    self.test_docs.append(
                        prepare_text_for_analysis(reuters.raw(doc_id)))
                    raw_test_classes.append(reuters.categories(doc_id))

            self.make_table_of_classes(raw_train_classes)
            self.train_classes = self.transform_classes(
                raw_train_classes, "train")
            self.test_classes = self.transform_classes(raw_test_classes,
                                                       "test")

            joblib.dump(self.train_docs,
                        "training_cache/train_docs",
                        compress=9)
            joblib.dump(self.train_classes,
                        "training_cache/train_classes",
                        compress=9)
            joblib.dump(self.test_docs, "training_cache/test_docs", compress=9)
            joblib.dump(self.test_classes,
                        "training_cache/test_classes",
                        compress=9)
            joblib.dump(self.table_of_classes,
                        "classify_cache/table_of_classes",
                        compress=9)
def load_test_data():
	for id in test_corn_ids:
		test_corn_target.append(0)
		test_corn.append(reuters.raw(id))
	for id in test_wheat_ids:
		test_wheat_target.append(1)
		test_wheat.append(reuters.raw(id))
	test = test_corn + test_wheat
	test_target = test_corn_target + test_wheat_target
	return test, test_target		
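
The snippet relies on module-level lists (test_corn_ids, test_corn and so on); a sketch of how they are presumably initialized from the corpus:

from nltk.corpus import reuters

test_corn_ids = [i for i in reuters.fileids("corn") if i.startswith("test/")]
test_wheat_ids = [i for i in reuters.fileids("wheat") if i.startswith("test/")]
test_corn, test_corn_target = [], []
test_wheat, test_wheat_target = [], []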
Example #19
def reuters_dataset(directory='../data',
                    train=True,
                    test=False,
                    clean_txt=False):
    """
    Load the Reuters-21578 dataset.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        test (bool, optional): If to load the test split of the dataset.

    Returns:
        :class:`tuple` of :class:`torchnlp.datasets.Dataset` or :class:`torchnlp.datasets.Dataset`:
        Returns between one and all dataset splits (train and test) depending on if their respective boolean argument
        is ``True``.
    """

    # nltk.download('reuters', download_dir=directory)
    if directory not in nltk.data.path:
        nltk.data.path.append(directory)

    doc_ids = reuters.fileids()

    ret = []
    splits = [
        split_set
        for (requested, split_set) in [(train, 'train'), (test, 'test')]
        if requested
    ]

    for split_set in splits:

        split_set_doc_ids = list(
            filter(lambda doc: doc.startswith(split_set), doc_ids))
        examples = []

        for id in split_set_doc_ids:
            if clean_txt:
                text = clean_text(reuters.raw(id))
            else:
                text = ' '.join(word_tokenize(reuters.raw(id)))
            labels = reuters.categories(id)

            examples.append({
                'text': text,
                'label': labels,
            })

        ret.append(Dataset(examples))

    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)
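
A hedged usage sketch, assuming the Reuters data is available under the given directory and that Dataset, clean_text and word_tokenize are imported as in the original module:

train_split = reuters_dataset(directory='../data', train=True, test=False)
print(len(train_split))           # number of training examples
print(train_split[0]['label'])    # category list of the first article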
Example #20
def main():
    train_docs = []
    test_docs = []
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
    representer = tf_idf(train_docs)
    for doc in test_docs:
        print(feature_values(doc, representer))
Example #21
def main():
    collection_stats()

    print("Starting classifier ..")

    X_train = list()
    X_test = list()

    y_train = list()
    y_test = list()

    print("Reading training and testing data ..")

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))

    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)

    binarizer = MultiLabelBinarizer(classes=reuters.categories())

    classifier = Pipeline([
        ('vectorizer',
         TfidfVectorizer(tokenizer=tokenize,
                         min_df=0,
                         max_df=0.90,
                         max_features=3000,
                         use_idf=True,
                         sublinear_tf=True)),
        # ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
    print("Training classifier ..")
    classifier.fit(X_train, binarizer.fit_transform(y_train))
    print("Testing classifier ..")
    res = classifier.predict(X_test)

    hard_precision = classifier.score(X_test, binarizer.transform(y_test))

    precision = average_precision_score(res,
                                        binarizer.fit_transform(y_test),
                                        average=None)
    recall = recall_score(res, binarizer.fit_transform(y_test), average=None)
    f1score = f1_score(res, binarizer.fit_transform(y_test), average=None)
    print("Hard precision: " + str(hard_precision))

    log_results(reuters.categories(), precision, recall, f1score)
Example #22
def main():
    train_docs = []
    test_docs = []

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))

    representer = tf_idf(train_docs);
def load_train_data():
	train = []
	train_target = []
	for id in train_corn_ids:
		train_corn_target.append(0)
		train_corn.append(reuters.raw(id))
	for id in train_wheat_ids:
		train_wheat_target.append(1)
		train_wheat.append(reuters.raw(id))
	train = train_corn + train_wheat
	train_target = train_corn_target + train_wheat_target
	return train, train_target
Example #24
def getDocIDs_90():
    # 90 Categories
    documents = reuters.fileids()
    train_docs_id = list(
        filter(
            lambda doc: doc.startswith("train") and len(reuters.raw(doc)) > 51,
            documents))
    test_docs_id = list(
        filter(
            lambda doc: doc.startswith("test") and len(reuters.raw(doc)) > 51,
            documents))
    return (train_docs_id, test_docs_id)
Example #25
    def get_default_split():
        documents = reuters.fileids()
        train_docs_id = list(
            filter(lambda doc: doc.startswith("train"), documents))
        test_docs_id = list(
            filter(lambda doc: doc.startswith("test"), documents))

        X_train = [reuters.raw(doc_id) for doc_id in train_docs_id]
        X_test = [reuters.raw(doc_id) for doc_id in test_docs_id]
        Y_train = [reuters.categories(doc_id) for doc_id in train_docs_id]
        Y_test = [reuters.categories(doc_id) for doc_id in test_docs_id]

        return X_train, Y_train, X_test, Y_test
Example #26
def load_data(valid_percent=0.1):
    """
    Load the Reuters dataset.

    Returns:
        raw text and raw labels for train, valid, test set.
    """

    nltk.download('reuters')
    n_classes = 90
    labels = reuters.categories()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]

    ys = {'train': [], 'test': []}
    ys['train'] = [reuters.categories(doc_id) for doc_id in train]
    ys['test'] = [reuters.categories(doc_id) for doc_id in test]

    # Validation
    n_valid = int(valid_percent * len(ys['train']))
    np.random.seed(5)
    idxs = np.random.choice(len(ys['train']), n_valid, replace=False)
    idx_set = set(idxs)
    docs['valid'] = []
    ys['valid'] = []
    train_docs = []
    train_y = []
    for idx, (x, y) in enumerate(zip(docs['train'], ys['train'])):
        if idx in idx_set:
            docs['valid'].append(x)
            ys['valid'].append(y)
        else:
            train_docs.append(x)
            train_y.append(y)

    data = {
        'x_train': train_docs,
        'y_train': train_y,
        'x_valid': docs['valid'],
        'y_valid': ys['valid'],
        'x_test': docs['test'],
        'y_test': ys['test'],
        'labels': labels
    }
    return data
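
A hedged usage sketch for this raw-text loader; unlike the earlier variants it also carves out a validation split:

data = load_data(valid_percent=0.1)
print(len(data['x_train']), len(data['x_valid']), len(data['x_test']))
print(data['labels'][:5])   # first few of the 90 category names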
    def stats(self):
        """
        :return:    Important statistics about the dataset - numbers of documents in different classes with
                    corresponding percentages, as well as vocabulary sizes for every class.
        """
        lt = LemmaTokenizer()
        train_stats = {}
        test_stats = {}

        for c in reuters.categories():
            train_stats[c] = {
                'num_of_docs': 0,
                'percentage': 0.0,
                'words': set([])
            }
            test_stats[c] = {
                'num_of_docs': 0,
                'percentage': 0.0,
                'words': set([])
            }

        for d in self.train:
            c = reuters.categories(d)[0]
            train_stats[c]['num_of_docs'] += 1
            train_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))
        for d in self.test:
            c = reuters.categories(d)[0]
            test_stats[c]['num_of_docs'] += 1
            test_stats[c]['words'] |= set(lt.lemma_tokenize(reuters.raw(d)))

        s_train = sum(train_stats[c]['num_of_docs']
                      for c in train_stats.keys())
        s_test = sum(test_stats[c]['num_of_docs'] for c in test_stats.keys())

        res = ({}, {})

        for c in train_stats.keys():
            if train_stats[c]['num_of_docs'] != 0:
                train_stats[c][
                    'percentage'] = train_stats[c]['num_of_docs'] / s_train
                train_stats[c]['words'] = len(train_stats[c]['words'])
                res[0][c] = train_stats[c]
        for c in test_stats.keys():
            if test_stats[c]['num_of_docs'] != 0:
                test_stats[c][
                    'percentage'] = test_stats[c]['num_of_docs'] / s_test
                test_stats[c]['words'] = len(test_stats[c]['words'])
                res[1][c] = test_stats[c]

        return res
Example #28
def main():
	train_docs = []
	test_docs = []

	for doc_id in reuters.fileids():
		if doc_id.startswith("train"):		
			train_docs.append(reuters.raw(doc_id))
		else:
			test_docs.append(reuters.raw(doc_id))
		
	representer = tf_idf(train_docs);

	for doc in test_docs:
		print(feature_values(doc, representer))
Example #29
def compute_idf():
    words = set()
    for fileid in reuters.fileids():
        tokens = tokenize(reuters.raw(fileid))
        words.update(tokens)
    idf = dict.fromkeys(words, 0)
    for fileid in reuters.fileids():
        tokens = set(tokenize(reuters.raw(fileid)))
        for token in tokens:
            idf[token] += 1
    total = len(reuters.fileids())
    for word in words:
        idf[word] = math.log(total / (1 + idf[word]))
    return idf
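
A hedged usage sketch: combine the IDF table with per-document term frequencies to rank a document's terms (tokenize is the same external helper assumed by the snippet itself):

from collections import Counter
from nltk.corpus import reuters

idf = compute_idf()
doc_id = reuters.fileids()[0]
tokens = tokenize(reuters.raw(doc_id))
tf = Counter(tokens)
tfidf = {t: (tf[t] / len(tokens)) * idf[t] for t in tf}
print(sorted(tfidf, key=tfidf.get, reverse=True)[:10])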
Example #30
def load_data():
    docs = reuters.fileids()
    train_ids = [doc for doc in docs if doc.startswith("train")]
    test_ids = [doc for doc in docs if doc.startswith("test")]

    train_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                               for id in train_ids],
                              columns=('text', 'labels'))

    test_data = pd.DataFrame([(reuters.raw(id), reuters.categories(id)[0])
                              for id in test_ids],
                             columns=('text', 'labels'))

    return train_data, test_data
Example #31
 def load(self):
     logger.info('Starting processing reuters dataset.')
     self.df = pd.DataFrame([{
         'doc_id':
         doc_id,
         'abspath':
         str(reuters.abspath(doc_id)),
         'categories': [c + ' ' for c in reuters.categories(doc_id)],
         'headline':
         reuters.raw(doc_id).split('\n', 1)[0],
         'length':
         len(reuters.raw(doc_id))
     } for doc_id in reuters.fileids()])
     logger.info('Finishing processing reuters dataset.')
Example #32
def get_train_test_reauter_data():
    train_docs = []
    test_docs = []

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))

    sliceObject = slice(5)
    train_docs = train_docs[sliceObject]
    test_docs = test_docs[sliceObject]
    return train_docs, test_docs
Example #33
def collection_stats():
	# List of documents
	documents = reuters.fileids()
	print(str(len(documents)) + " documents");
	
	train_docs = list(filter(lambda doc: doc.startswith("train"), documents));
	print(str(len(train_docs)) + " total train documents");
	
	test_docs = list(filter(lambda doc: doc.startswith("test"), documents));	
	print(str(len(test_docs)) + " total test documents");

	# List of categories 
	categories = reuters.categories();
	print(str(len(categories)) + " categories");

	# Documents in a category
	category_docs = reuters.fileids("acq");

	# Words for a document
	document_id = category_docs[0]
	document_words = reuters.words(category_docs[0]);
	print(document_words);	

	# Raw document
	print(reuters.raw(document_id));
Example #34
def main():
    train_docs = []
    test_docs = []

    collection_stats()

    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):      
            train_docs.append(reuters.raw(doc_id))
        else:
            test_docs.append(reuters.raw(doc_id))
        
    representer = tf_idf(train_docs);

    classifier = nltk.NaiveBayesClassifier.train (train_docs)
    print nltk.classify.accuracy (classifier, test_docs)
Example #35
def run():

    """Import the Reuters Corpus which contains 10,788 news articles"""

    from nltk.corpus import reuters
    raw_docs = [reuters.raw(fileid) for fileid in reuters.fileids()]

    # Select 100 documents randomly
    rand_idx = random.sample(range(len(raw_docs)), 100)
    raw_docs = [raw_docs[i] for i in rand_idx]

    # Preprocess Documents
    tokenized_docs = [ie_preprocess(doc) for doc in raw_docs]

    # Remove single occurance words
    docs = remove_infrequent_words(tokenized_docs)

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Build LDA model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    for topic in lda.show_topics():
        print topic
Example #36
def create_tfidf_data(docs,categories,n=None):
    """
    Builds a [(label, [words])] structure by parsing the documents.
    :param docs: list of Reuters documents
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]

    cat_num = {}; i = 1
    for c in categories:
        cat_num[c] = i
        i += 1

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())

    return y, corpus
Example #37
def format_data(docs, all_categories):
    y = []; corpus = []
    for d in docs:
        current_categories = filter(lambda x: x in all_categories,reuters.categories(d))
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
Example #38
 def construct_freq(self, text = None):
     if text == None:
         from nltk.corpus import reuters
         text = reuters.raw()
     #Get rid of \n
     c_text = text.replace('\n','')
     self.letter_freq = nltk.FreqDist(c_text)
     for bigram in nltk.bigrams(c_text):
         self.letter_freq.inc(''.join(bigram))
Example #39
 def __iter__(self):
     """ Generator of docs while collecting ordered structured info. """
     for n, reutersid in enumerate(reuters.fileids()):         # 'training|test/xxxx'
         dataset, _ = reutersid.split('/')       # extract dataset
         if self.dataset in dataset:             # yield only filtered dataset
             if self.categories is not None:
                 top_category = reuters.categories(reutersid)[0]            # grab first category only
                 self.category_mask.append(self.categories[top_category])   # n-th doc -> classid
             yield reuters.raw(reutersid)        # return raw document
Example #40
def get_raw_text(corpus,file_name):
    string=''
    if corpus=='mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    return string
def write_into_new_file(file_name):
    #stopwords_english = set(stopwords.words('english'))
    string = reuters.raw(fileids=file_name)
    list_words = re.split(r'\W+',string)
    new_file_path = new_path+file_name
    file_wr = open(new_file_path, "w")
    for w in list_words:
        if w.isalpha() and len(w)>1 and w.lower() not in stopwords_english:
            file_wr.write(w.lower()+"\n")
    file_wr.close()
def get_list_tokens_nltk(corpus, file_name):
    string=''
    if corpus=='mr':
        from nltk.corpus import movie_reviews
        string = movie_reviews.raw(fileids=file_name)
    else:
        from nltk.corpus import reuters
        string = reuters.raw(fileids=file_name)
    list_words = re.split(r'\W+',string)
    return [w.lower() for w in list_words if w.isalpha() and len(w)>1 and w.lower() not in stopwords_english]
Example #43
    def build_TFIDF_model(self):
        """ Build term-document matrix containing TF-IDF score for each word in each document
            in the Reuters corpus (via NLTK).
        """
        token_dict = {}
        for article in reuters.fileids():
            token_dict[article] = reuters.raw(article)

        # Use TF-IDF to determine frequency of each word in our article, relative to the
        # word frequency distributions in corpus of 11k Reuters news articles.
        self._tfidf = TfidfVectorizer(tokenizer=self.tokenize_and_stem, stop_words='english', decode_error='ignore')
        tdm = self._tfidf.fit_transform(token_dict.values())  # Term-document matrix
Example #44
def create_tfidf_data(docs,n=None):
    """
    Builds a [(label, [words])] structure, removing the stopwords
    and parsing the documents.
    :param docs: list of Reuters documents
    :param n: number of documents to use
    :return: list
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus
Example #45
def list_doc_topics(doc_test_topics,doc_train_topics):
    """
    creates a list of two-tuples
    that contain a single feature entry and the body text. 
    """    
    ref_docs = []
    ref_docs_test=[]
    for d in doc_train_topics:
        t1=d.split()[1:]
        d0 = rt0.raw(d.split()[0])
        d0= d0.replace('\n','')
        for t in t1:
            d_tup = (t, d0)
            ref_docs.append(d_tup)
            
    for d in doc_test_topics:
        t2=d.split()[1:]
        d00 = rt0.raw(d.split()[0])
        d00= d00.replace('\n','')
        for t in t2:
            d_tup = (t, d00)
            ref_docs_test.append(d_tup)
    return ref_docs,ref_docs_test
Example #46
 def getDocContentById(self, docId):
     ''' 
     Gets the document content by its id
     
     Parameters
     ----------
     docId : string
         The document id.
     
     Returns
     -------
     The document content (unicode)
     '''
     return reuters.raw(docId)
Example #47
def benchmark(storage_class, create=True):
    m = storage_class(4)
    s = Segmenter(m, 3)
    if create:
        m.clear()

    corpus = reuters.raw()

    tokens = list(filter(lambda t: t.category == '', tokeniser_fr(corpus)))[:10000]
    
    if create:
        m.add_sentence(tokens)

    for i in range(1,5000,30):
        print(s.segment(tokens[i:i+30]))
Example #48
def summarize(storyid):
    stopwords = ['a','an','and','are','as','at','be','but','by','for','if','in','into','is','it','no','not','of','on','or','s','such','t','that','the','their','then','there','these','they','this','to','was','will','with']
    text="".join(reuters.raw(storyid).split("\n"))
    dictW=defaultdict(int)
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    ls=sent_detector.tokenize(text.strip())
    for l in ls:
        tokens=tokenizer.tokenize(l)
        for token in tokens:
          if token.lower() not in stopwords:
              if token not in dictW:
                  dictW[token]=1
              else:
                dictW[token]+=1
    title=ls[:1]
    ls=ls[1:]
    MAX_SUMMARY_SIZE=int(0.20*len(ls))
    ls.sort(key=lambda s: sum((dictW[token] for token in tokenizer.tokenize(s))), reverse=1)
    ls= ls[:MAX_SUMMARY_SIZE]
    ls.sort(lambda s1, s2:text.find(s1)-text.find(s2))
    ls=title+ls
    print "".join(ls)
    print
Example #49
#for use with sklearn 
def myparser(s):
	punc='[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n ]' #all punc+whtspc+newline
	np=[a for a in re.split(punc,s) if a not in string.punctuation]
	low=[a.lower() for a in np if len(a)>2] #only two-lett words lowered
	nostop=[a for a in low if a not in stopwords.words('english')]
	return [porter.stem(a) for a in nostop if re.findall(r"[^\W\d]",a)]
#imports
from sklearn.feature_extraction.text import TfidfVectorizer
#object instantiation - ignore utf-8 decode errors
vectfidf=TfidfVectorizer(tokenizer=myparser,decode_error='ignore')
#test corpus
#corpus=[reuters.raw('training/9853'),reuters.raw('training/9866')]

#reuters corpus
corpus=[(reuters.raw(fileid),cat) for cat in reuters.categories()
for fileid in reuters.fileids(cat)]
random.seed(1979)
random.shuffle(corpus)
size=int(len(corpus)*0.1)
train_raw=corpus[size:]
test_raw=corpus[:size]
train_raw_data=[a[0] for a in train_raw]
test_raw_data=[a[0] for a in test_raw]


y_train=[a[1] for a in train_raw]
y_test=[a[1] for a in test_raw]
#DO NOT have to turn labels to ints
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
Example #50
    return d

# load or create the character encoding dictionaries
if os.path.exists(char_idx_path):
    with open(char_idx_path, 'rb') as f:
        logger.info('Loading character encodings from "%s"' % char_idx_path)
        idx_to_char = pickle.load(f)
        char_to_idx = pickle.load(f)
        cat_enc = pickle.load(f)
else:
    n_docs = len(reuters.fileids())
    cat_enc = dict((x, i+1) for i, x in enumerate(set(reuters.categories())))

    chars = set()
    for fid in reuters.fileids():
        chars = chars.union(set(reuters.raw(fid).lower()))

    idx_to_char = dict((i, c) for i, c in enumerate(chars))
    char_to_idx = dict((c, i) for i, c in enumerate(chars))

    with open(char_idx_path, 'wb') as f:
        logger.info('Saving character encodings to "%s"' % char_idx_path)
        pickle.dump(idx_to_char, f)
        pickle.dump(char_to_idx, f)
        pickle.dump(cat_enc, f)

if os.path.exists(reuters_enc_path):
    logging.info('Loading reuters encodings from "%s"' % reuters_enc_path)
    np_file = np.load(reuters_enc_path)
    cats = np_file['arr_0']
    docs = np_file['arr_1']
nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Convert the categories to one hot encoded categories
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Convert load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)} for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)} for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data, test_data, train_labels, test_labels = numpy.asarray(train_data), numpy.asarray(test_data), numpy.asarray(train_labels), numpy.asarray(test_labels)

# Initialize the neural network
model = Sequential()
model.add(Dense(input_dim=doc2vec_dimensions, output_dim=500, activation='relu'))
model.add(Dropout(0.3))
Example #52
#!/usr/bin/python
#coding:utf-8

# 2013/03/01
# Is tokenizing and then lowercasing the same as lowercasing the full text and then tokenizing?


from nltk.corpus import reuters
#from nlp.clustering.preprocess import preprocess
import nlp.clustering.preprocess.preprocess as preprocess

raw=reuters.raw(fileids=[reuters.fileids()[1]])
docs1 = preprocess.tokenize(raw)
docs1 = preprocess.lower(docs1)
docs2 = preprocess.tokenize(preprocess.lower(raw))
docs1 == docs2

raws=[reuters.raw(fileids=[fid]) for fid in reuters.fileids()]
docs1 = preprocess.tokenize(raws)
docs1 = preprocess.lower(docs1)
docs2 = preprocess.tokenize(preprocess.lower(raws))
docs1 == docs2

docs3 = preprocess.word_tokenize(preprocess.lower(preprocess.sent_tokenize(raws)))
docs1 == docs3
docs2 == docs3

import timeit

setup='''
from nltk.corpus import reuters
Example #53
def load_reuters(setName):
    html = HTMLParser.HTMLParser()
    doc_ids = reuters.fileids()
    cat2all_ids = {}
    cat2train_ids = {}
    cat2test_ids = {}
    cat2all_num = {}
    cand_docNum = 0
    
    for doc_id in doc_ids:
        # only choose docs belonging in one category
        if len( reuters.categories(doc_id) ) == 1:
            cat = reuters.categories(doc_id)[0]
            cand_docNum += 1
            
            if doc_id.startswith("train"):
                cat2set_ids = cat2train_ids
            else:
                cat2set_ids = cat2test_ids
                
            if cat in cat2set_ids:
                cat2set_ids[cat].append(doc_id)
            else:
                cat2set_ids[cat] = [ doc_id ]
            
            # both train and test doc_ids are put in cat2all_ids
            if cat in cat2all_ids:
                cat2all_ids[cat].append(doc_id)
            else:
                cat2all_ids[cat] = [ doc_id ]
            if cat in cat2all_num:
                cat2all_num[cat] += 1
            else:
                cat2all_num[cat] = 1
            
    print "Totally %d docs, %d single-category docs in %d categories" %( len(doc_ids), 
                    cand_docNum, len(cat2train_ids) )
                    
    sorted_cats = sorted( cat2all_num.keys(), key=lambda cat: cat2all_num[cat],
                            reverse=True )
                            
    catNum = 10
    cats_docsWords = [ [] for i in xrange(catNum) ]
    cats_docNames = [ [] for i in xrange(catNum) ]
                            
    topN_cats = sorted_cats[:catNum]
    print "Top 10 categories:"
    keptAllDocNum = 0
    keptTrainDocNum = 0
    keptTestDocNum = 0
    
    for cat in topN_cats:
        print "%s: %d/%d" %( cat, len(cat2train_ids[cat]), len(cat2test_ids[cat]) )
        keptTrainDocNum += len(cat2train_ids[cat])
        keptTestDocNum += len(cat2test_ids[cat])
        keptAllDocNum += len(cat2train_ids[cat]) + len(cat2test_ids[cat])
        
    print "Totally %d docs kept, %d in train, %d in test" %( keptAllDocNum, 
                        keptTrainDocNum, keptTestDocNum )    
    
    if setName == "train":
        cat2set_ids = cat2train_ids
        setDocNum = keptTrainDocNum
    elif setName == "test":
        cat2set_ids = cat2test_ids
        setDocNum = keptTestDocNum
    elif setName == "all":
        cat2set_ids = cat2all_ids
        setDocNum = keptAllDocNum
    else:
        raise Exception("Unknown set name %s" %setName)
            
    orig_docs_name = []
    orig_docs_cat = []
    orig_docs_words = []
    readDocNum = 0
    totalLineNum = 0
    emptyFileNum = 0
    
    for cat_id, cat in enumerate(topN_cats):
        for doc_id in cat2set_ids[cat]:
            if readDocNum % 50 == 49 or readDocNum == setDocNum - 1:
                print "\r%d %d\r" %( readDocNum + 1, totalLineNum ),
            text = html.unescape( reuters.raw(doc_id) )
            text = text.encode("utf-8")
            lines = text.split("\n")
            if len(text) == 0 or len(lines) == 0:
                emptyFileNum += 1
                continue
        
            readDocNum += 1
            totalLineNum += len(lines)
        
            text = " ".join(lines)
            wordsInSentences, wc = extractSentenceWords(text)
            
            filename = doc_id
            orig_docs_words.append( wordsInSentences )
            orig_docs_name.append(filename)
            orig_docs_cat.append(cat_id)
            cats_docsWords[cat_id].append(wordsInSentences)
            cats_docNames[cat_id].append(filename)
            
    print "Done. %d docs read, %d empty docs skipped. Totally %d lines" %(readDocNum, emptyFileNum, totalLineNum)
    return setDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, \
                cats_docsWords, cats_docNames, topN_cats
    
Example #54
 def getWordsFromReutersDoc(self, doc):
     return self.getWords(reuters.raw(doc))
Example #55
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract
extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
	with open('./data/terms.txt', 'w') as o:
		o.write("Term\tOccurences\tStrength\n")
		for term in extractor(f.read()+gutenberg.raw()+abc.raw()+reuters.raw()+brown.raw()+movie_reviews.raw()):
			o.write("\t".join(map(str, term)) + "\n")
from nltk.corpus import inaugural, reuters, brown, gutenberg

from itertools import product as iter_product

def words(text):
    return re.findall('[a-z]+', text.lower())


def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes    = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
    replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts    = [a + c + b     for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
Example #57
def reutersdocs(doc_index):
    """ Generator of docs from corpus reuters."""
    for doc_type_id in reuters.fileids():
        _, id = doc_type_id.split('/')
        doc_index.append(id)
        yield reuters.raw(doc_type_id)
Example #58
    def get_tf_idf(self,sample_doc,corpus_list):
        for tip in corpus_list:
            
            tokens =  nltk.word_tokenize(reuters.raw(tip))

            
            bi_tokens = nltk.bigrams(tokens)
            tri_tokens = nltk.trigrams(tokens)
            
            tokens = [token.lower() for token in tokens if len(token) > 2]
            tokens = [token for token in tokens if token not in self.stopwords]
            
            bi_tokens = [' '.join(token).lower() for token in bi_tokens]
            bi_tokens = [token for token in bi_tokens if token not in self.stopwords]
         
            tri_tokens = [' '.join(token).lower() for token in tri_tokens]
            tri_tokens = [token for token in tri_tokens if token not in self.stopwords]
         
            final_tokens = []
            final_tokens.extend(tokens)
            final_tokens.extend(bi_tokens)
            final_tokens.extend(tri_tokens)
            self.docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                                'tf-idf': {}, 'tokens': []}
         
            for token in final_tokens:
                #The frequency computed for each tip
                self.docs[tip]['freq'][token] = self.freq(token, final_tokens)
                #The term-frequency (Normalized Frequency)
                self.docs[tip]['tf'][token] = self.tf(token, final_tokens)
                
                self.docs[tip]['tokens'] = final_tokens
     
            self.vocabulary.append(final_tokens)
            
            print 'vocabulary size is {0}'.format(len(self.vocabulary));
            #print 'haha'
        #print self.vocabulary       
        
        x = 0;     
        for doc in self.docs:
            for token in self.docs[doc]['tf']:
                #The Inverse-Document-Frequency
                self.docs[doc]['idf'][token] = self.idf(token, self.vocabulary)
                #The tf-idf
                self.docs[doc]['tf-idf'][token] = self.tf_idf(token, self.docs[doc]['tokens'], self.vocabulary)
                #x = x + 1
            print 'Current iteration is {0}'.format(x)
        
        words  = {};
        for doc in self.docs:
            for token in self.docs[doc]['tf-idf']:
                if token not in words:
                    words[token] = self.docs[doc]['tf-idf'][token]
                else:
                    if self.docs[doc]['tf-idf'][token] > words[token]:
                        words[token] = self.docs[doc]['tf-idf'][token]
         
            
            for token in self.docs[doc]['tf-idf']:
                print token, self.docs[doc]['tf-idf'][token]
                
        for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
            print "%f <= %s" % (item[1], item[0])    
from nltk import word_tokenize
from nltk.corpus import reuters
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_vectors_location = 'model/doc2vec-vectors.bin'
doc2vec_dimensions = 300

doc2vec = Doc2Vec.load(doc2vec_model_location)

jobs = [{'category': 'jobs', 'vec': doc2vec.infer_vector(word_tokenize(reuters.raw(fileId)))} for fileId in reuters.fileids(['jobs'])]
trade = [{'category': 'trade', 'vec': doc2vec.infer_vector(word_tokenize(reuters.raw(fileId)))} for fileId in reuters.fileids(['trade'])[:500]]

docs = [doc for doc in itertools.chain(jobs, trade)]

pca = PCA(n_components=50)
fiftyDimVecs = pca.fit_transform([doc['vec'] for doc in docs])
tsne = TSNE(n_components=2)
twoDimVecs = tsne.fit_transform(fiftyDimVecs)

fig, ax = plt.subplots()
for doc, twoDimVec in zip(docs, twoDimVecs):
    ax.scatter(twoDimVec[0], twoDimVec[1], color=('r' if doc['category'] == 'jobs' else 'b'))
plt.show()

Example #60
def load_data(padding=0, sent_len=300, w2i=None):
    
    """
        threshold = 0  all labels in test data
        threshold = 1  only multilabels in test data
    """
    threshold = 1 
    
    train_docs, train_cats, test_docs, test_cats = [], [], [], []
    
    popular_topics = set(['earn','acq','money-fx','grain','crude','trade','interest','ship','wheat','corn'])
    
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            if set(reuters.categories(doc_id)).issubset(popular_topics):
                train_docs.append(reuters.raw(doc_id))
                train_cats.append([cat for cat in reuters.categories(doc_id)])
#            train_cats.append(
#                [cats.index(cat) for cat in reuters.categories(doc_id)])
        else:
            if set(reuters.categories(doc_id)).issubset(popular_topics):
                test_docs.append(reuters.raw(doc_id))
                test_cats.append([cat for cat in reuters.categories(doc_id)])
    
    dataset = train_docs + test_docs
    max_sent_len, word_to_idx = get_vocab(dataset)
    if sent_len > 0:
        max_sent_len = sent_len      
    if w2i is not None:
        word_to_idx = w2i    

    train, train_label, test, test_label = [], [], [], []
        
    for i, line in enumerate(train_docs):
        words = line_to_words(line)
        y = train_cats[i]
        if len(y) > 1: # Examples with more than one label are moved to the test data.
            test_docs.append(line)
            test_cats.append(y)
            continue           
        y = y[0]
        sent = [word_to_idx[word] for word in words if word in word_to_idx]
        if len(sent) > max_sent_len:
            sent = sent[:max_sent_len]
        else:    
            sent.extend([0] * (max_sent_len + padding - len(sent)))
        train.append(sent)
        train_label.append(y)
    
    single_label = ['-1'] + list(set(train_label))
    num_classes = len(single_label)
    for i, l in enumerate(train_label):
        train_label[i] = single_label.index(l)
        
    for i, line in enumerate(test_docs):
        words = line_to_words(line)
        y = test_cats[i]    
        sent = [word_to_idx[word] for word in words if word in word_to_idx]
        if len(sent) > max_sent_len:
            sent = sent[:max_sent_len]
        else:    
            sent.extend([0] * (max_sent_len + padding - len(sent)))
        if len(y) > threshold and set(y).issubset(single_label):
            test.append(sent)
            one_hot_y = np.zeros([num_classes],dtype=np.int32)
            for yi in y:
                one_hot_y[single_label.index(yi)]=1 
            test_label.append(one_hot_y)
        
    return single_label, word_to_idx, np.array(train), np.array(train_label), np.array(test), np.array(test_label)