Example 1
def Reuters_task_generator(dataset_home='../datasets/Reuters', tfidf=True):

    print('fetching Reuters')
    reuters = fetch_reuters21579(dataset_home=dataset_home, dopickle=True)

    for order, task in enumerate(['OrgsPlaces', 'OrgsPeople', 'PeoplePlaces']):
        Xs, ys = reuters[task]['src']
        Xt, yt = reuters[task]['tar']
        assert Xs.shape[1] == Xt.shape[1], 'wrong number of columns'

        if tfidf:
            # use a local name so we do not shadow the `tfidf` flag argument
            transformer = TfidfTransformer(sublinear_tf=True)
            Xs = transformer.fit_transform(Xs)
            Xt = transformer.transform(Xt)

        fake_vocab = {'f%d' % i: i for i in range(Xs.shape[1])}
        source = Domain(Xs, ys, Xs, fake_vocab, task + '_source')
        target = Domain(Xt, yt, Xt, fake_vocab, task + '_target')

        print('X.shape={}, y-prevalence={:.3f}'.format(source.X.shape,
                                                       source.y.mean()))
        print('X.shape={}, y-prevalence={:.3f}'.format(target.X.shape,
                                                       target.y.mean()))

        yield source, target, '{}. {}'.format(order, task)
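
A minimal consumer of this generator might look as follows; this is an illustrative sketch only (the classifier choice and the availability of Reuters_task_generator in scope are assumptions, not part of the original code):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

for source, target, taskname in Reuters_task_generator(tfidf=True):
    # train on the source domain and evaluate directly on the target domain
    clf = LogisticRegression(max_iter=1000)
    clf.fit(source.X, source.y)
    acc = accuracy_score(target.y, clf.predict(target.X))
    print('{}: cross-domain accuracy={:.3f}'.format(taskname, acc))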
Example 2
def _domain_from_usenet(news,
                        positive,
                        negative,
                        domain_name,
                        max_documents=None,
                        tfidf=True):
    pos_docs = list(chain(*[news[l] for l in positive]))
    neg_docs = list(chain(*[news[l] for l in negative]))

    if max_documents is not None:
        pos_docs = pos_docs[:int(max_documents / 2)]
        neg_docs = neg_docs[:max_documents - len(pos_docs)]

    all_docs = pos_docs + neg_docs
    if tfidf:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     min_df=3,
                                     strip_accents='unicode')
    else:
        vectorizer = CountVectorizer(min_df=3, strip_accents='unicode')
    X = vectorizer.fit_transform(all_docs)
    y = np.array([1] * len(pos_docs) + [0] * len(neg_docs))
    V = vectorizer.vocabulary_
    print('X.shape={}, y-prevalence={:.3f}'.format(X.shape, y.mean()))
    return Domain(X, y, X, V, domain_name)
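
A hypothetical call, assuming a news dictionary that maps newsgroup names to lists of raw documents (e.g., built from sklearn's fetch_20newsgroups grouped by target name); the group names and sizes below are purely illustrative:

comp_vs_rec = _domain_from_usenet(
    news,
    positive=['comp.graphics', 'comp.sys.mac.hardware'],
    negative=['rec.autos', 'rec.sport.hockey'],
    domain_name='comp_vs_rec',
    max_documents=2000,
    tfidf=True)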
Example 3
def as_domain(labeled_docs,
              labels,
              unlabeled_docs,
              issource,
              domain,
              translations=None,
              language='en',
              tokken_pattern=r"(?u)\b\w\w+\b",
              min_df=1):
    """
    Represents raw documents as a Domain; a Domain contains the tf-idf-weighted co-occurrence matrices of the labeled
    and unlabeled documents (with a consistent vocabulary).
    :param labeled_docs: the set of labeled documents
    :param labels: the labels of labeled_docs
    :param unlabeled_docs: the set of unlabeled documents
    :param issource: boolean; if True, the vocabulary is bound to the labeled documents (the training set);
    otherwise, the vocabulary is bound to that of the unlabeled set (which is expectedly bigger), since we
    assume the test set is only seen during evaluation. This is not true in a transductive setting, but we
    force the same protocol so as to allow for a fair evaluation.
    :param domain: the name of the domain (e.g., 'books')
    :param translations: optional set of translated documents to be vectorized with the same tf-idf model; if given,
    their document-by-term matrix is also returned
    :param language: the language of the domain (e.g., 'french')
    :param tokken_pattern: the token pattern the sklearn vectorizer will use to tokenize words
    :param min_df: the minimum document frequency below which words are filtered out of the vocabulary
    :return: an instance of Domain, or a tuple (Domain, T) if translations is not None
    """
    if issource:
        counter = CountVectorizer(token_pattern=tokken_pattern, min_df=min_df)
        v = counter.fit(labeled_docs).vocabulary_
        tfidf = TfidfVectorizer(sublinear_tf=True,
                                token_pattern=tokken_pattern,
                                vocabulary=v)
    else:
        tfidf = TfidfVectorizer(sublinear_tf=True,
                                token_pattern=tokken_pattern,
                                min_df=min_df)
    U = tfidf.fit_transform(unlabeled_docs)
    X = tfidf.transform(labeled_docs)
    y = np.array(labels)
    V = tfidf.vocabulary_
    domain = Domain(X, y, U, V, domain, language)
    if translations is not None:
        T = tfidf.transform(translations)
        return domain, T
    else:
        return domain
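
An illustrative sketch of the source/target asymmetry described in the docstring (the variable names below are hypothetical; Example 5 shows the real call sites):

source = as_domain(src_train_docs, src_train_labels, src_unlabeled_docs,
                   issource=True, domain='books', language='en')
target = as_domain(tgt_test_docs, tgt_test_labels, tgt_unlabeled_docs,
                   issource=False, domain='books', language='de')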
Example 4
def WebisCLS10_Upper_task_generator(dataset_home='../datasets/Webis-CLS-10'):
    """
    Generates the upper-bound tasks for the cross-lingual experiments on the Webis-CLS-10 dataset
    :param dataset_home: the path where to store the dataset
    :return: yields domain tasks in the typical order of appearance in most papers
    """
    print('fetching Webis-CLS-10')
    documents, translations, dictionaries = fetch_Webis_cls_10(
        dataset_home=dataset_home, skip_translations=True, dopickle=True)

    patt = r"(?u)\b\w+\b"  # Japanese may contain words which are only one symbol

    taskno = 0
    for target_lan in ['de', 'fr', 'jp']:
        for domain in ['books', 'dvd', 'music']:
            print('Loading Webis-CLS-10 task ' + '{}{}-{}{}'.format(
                target_lan, domain, target_lan, domain).upper())

            tr_t_docs, tr_t_labels = list(
                zip(*documents[target_lan][domain]['train.processed']))
            te_t_docs, te_t_labels = list(
                zip(*documents[target_lan][domain]['test.processed']))

            tfidf = TfidfVectorizer(sublinear_tf=True,
                                    token_pattern=patt,
                                    min_df=1)
            Xtr = tfidf.fit_transform(tr_t_docs)
            Xte = tfidf.transform(te_t_docs)
            ytr = np.array(tr_t_labels)
            yte = np.array(te_t_labels)
            V = tfidf.vocabulary_
            train = Domain(Xtr, ytr, None, V, domain, target_lan)
            test = Domain(Xte, yte, None, V, domain, target_lan)

            print("source: X={}".format(train.X.shape))
            print("target: X={}".format(test.X.shape))

            taskname = '{}. {} {}'.format(taskno, train.name(), test.name())
            taskno += 1
            yield train, test, taskname
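
As with Example 1, a possible driver for the upper-bound tasks; the linear SVM is only an illustrative choice and is not part of the original code:

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

for train, test, taskname in WebisCLS10_Upper_task_generator():
    # monolingual upper bound: train and test in the same (target) language
    svm = LinearSVC().fit(train.X, train.y)
    acc = accuracy_score(test.y, svm.predict(test.X))
    print('{}: upper-bound accuracy={:.3f}'.format(taskname, acc))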
Example 5
def WebisCLS10_task_generator(dataset_home='../datasets/Webis-CLS-10',
                              skip_translations=True):
    """
    Generates the tasks for the cross-lingual experiments on the Webis-CLS-10 dataset
    :param dataset_home: the path where to store the dataset
    :param skip_translations: if True, the automatic translations of the target test documents are not loaded and the
    generator yields 4-tuples; otherwise it yields 5-tuples that also include the translated test set as a Domain
    :return: yields tasks (source domain, target domain, and source-to-target oracle), in the typical order of
    appearance in most papers
    """
    print('fetching Webis-CLS-10')
    documents, translations, dictionaries = fetch_Webis_cls_10(
        dataset_home=dataset_home,
        skip_translations=skip_translations,
        dopickle=True)

    patt = r"(?u)\b\w+\b"  # Japanese may contain words which are only one symbol

    source_lan = 'en'
    taskno = 0
    for target_lan in ['de', 'fr', 'jp']:
        for domain in ['books', 'dvd', 'music']:
            print('Loading Webis-CLS-10 task ' + '{}{}-{}{}'.format(
                source_lan, domain, target_lan, domain).upper())

            tr_s_docs, tr_s_labels = list(
                zip(*documents[source_lan][domain]['train.processed']))
            unlabel_s_docs, _ = list(
                zip(*documents[source_lan][domain]['unlabeled.processed']))
            if not skip_translations:
                transl_t_docs, transl_t_labels = list(
                    zip(*translations[target_lan][domain]['test.processed']))
                source, T = as_domain(tr_s_docs,
                                      tr_s_labels,
                                      unlabel_s_docs,
                                      issource=True,
                                      translations=transl_t_docs,
                                      domain=domain,
                                      language=source_lan,
                                      tokken_pattern=patt,
                                      min_df=1)
                Ty = np.array(transl_t_labels)
            else:
                source = as_domain(tr_s_docs,
                                   tr_s_labels,
                                   unlabel_s_docs,
                                   issource=True,
                                   translations=None,
                                   domain=domain,
                                   language=source_lan,
                                   tokken_pattern=patt,
                                   min_df=1)

            te_t_docs, te_t_labels = list(
                zip(*documents[target_lan][domain]['test.processed']))
            unlabel_t_docs, _ = list(
                zip(*documents[target_lan][domain]['unlabeled.processed']))
            target = as_domain(te_t_docs,
                               te_t_labels,
                               unlabel_t_docs,
                               issource=False,
                               domain=domain,
                               language=target_lan,
                               tokken_pattern=patt,
                               min_df=3)

            oracle = WordOracle(
                dictionaries['{}_{}_dict.txt'.format(source_lan, target_lan)],
                source_lan,
                target_lan,
                analyzer=CountVectorizer(token_pattern=patt).build_analyzer())

            print("source: X={} U={}".format(source.X.shape, source.U.shape))
            print("target: X={} U={}".format(target.X.shape, target.U.shape))

            taskname = '{}. {} {}'.format(taskno, source.name(), target.name())
            taskno += 1
            if skip_translations:
                yield source, target, oracle, taskname
            else:
                target_translations = Domain(T,
                                             Ty,
                                             None,
                                             source.V,
                                             domain,
                                             language='en')
                yield source, target, target_translations, oracle, taskname
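
Note that this generator yields 4-tuples when skip_translations=True and 5-tuples otherwise, so a consumer must unpack accordingly. A minimal, purely illustrative sketch:

for task in WebisCLS10_task_generator(skip_translations=True):
    source, target, oracle, taskname = task
    print(taskname, source.X.shape, target.X.shape)

for task in WebisCLS10_task_generator(skip_translations=False):
    source, target, target_translations, oracle, taskname = task
    print(taskname, target_translations.X.shape)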