def Reuters_task_generator(dataset_home='../datasets/Reuters', tfidf=True):
    print('fetching Reuters')
    reuters = fetch_reuters21579(dataset_home=dataset_home, dopickle=True)

    for order, task in enumerate(['OrgsPlaces', 'OrgsPeople', 'PeoplePlaces']):
        Xs, ys = reuters[task]['src']
        Xt, yt = reuters[task]['tar']
        assert Xs.shape[1] == Xt.shape[1], 'wrong number of columns'

        if tfidf:
            transformer = TfidfTransformer(sublinear_tf=True)
            Xs = transformer.fit_transform(Xs)
            Xt = transformer.transform(Xt)

        # the dataset comes pre-vectorized, so a placeholder vocabulary is built for the Domain objects
        fake_vocab = {'f%d' % i: i for i in range(Xs.shape[1])}
        source = Domain(Xs, ys, Xs, fake_vocab, task + '_source')
        target = Domain(Xt, yt, Xt, fake_vocab, task + '_target')

        print('X.shape={}, y-prevalence={:.3f}'.format(source.X.shape, source.y.mean()))
        print('X.shape={}, y-prevalence={:.3f}'.format(target.X.shape, target.y.mean()))

        yield source, target, '{}. {}'.format(order, task)
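# Example usage (a sketch; assumes the Reuters data can be fetched under '../datasets/Reuters' as above):
#
#     for source, target, taskname in Reuters_task_generator(tfidf=True):
#         print(taskname, source.X.shape, target.X.shape)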
def _domain_from_usenet(news, positive, negative, domain_name, max_documents=None, tfidf=True):
    pos_docs = list(chain(*[news[l] for l in positive]))
    neg_docs = list(chain(*[news[l] for l in negative]))

    if max_documents is not None:
        pos_docs = pos_docs[:int(max_documents / 2)]
        neg_docs = neg_docs[:max_documents - len(pos_docs)]

    all_docs = pos_docs + neg_docs

    if tfidf:
        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=3, strip_accents='unicode')
    else:
        vectorizer = CountVectorizer(min_df=3, strip_accents='unicode')

    X = vectorizer.fit_transform(all_docs)
    y = np.array([1] * len(pos_docs) + [0] * len(neg_docs))
    V = vectorizer.vocabulary_

    print('X.shape={}, y-prevalence={:.3f}'.format(X.shape, y.mean()))

    return Domain(X, y, X, V, domain_name)
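# Example usage (a sketch; 'news' is assumed to be a mapping from newsgroup label to a list of raw documents,
# e.g. built from sklearn's fetch_20newsgroups; the labels below are standard 20 Newsgroups categories):
#
#     domain = _domain_from_usenet(news,
#                                  positive=['comp.graphics', 'comp.sys.mac.hardware'],
#                                  negative=['sci.med', 'sci.space'],
#                                  domain_name='comp_vs_sci', max_documents=2000)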
def as_domain(labeled_docs, labels, unlabeled_docs, issource, domain, translations=None, language='en',
              tokken_pattern=r"(?u)\b\w\w+\b", min_df=1):
    """
    Represents raw documents as a Domain; a Domain contains the tfidf-weighted co-occurrence matrices of the
    labeled and unlabeled documents (with a consistent vocabulary).

    :param labeled_docs: the set of labeled documents
    :param labels: the labels of labeled_docs
    :param unlabeled_docs: the set of unlabeled documents
    :param issource: boolean; if True, the vocabulary is bound to the labeled documents (the training set);
        otherwise, the vocabulary is bound to that of the unlabeled set (which is expectedly bigger), since we
        assume the test set is only seen at evaluation time. This is not true in a transductive setting, but we
        force it to follow the same protocol so as to allow for a fair evaluation.
    :param domain: the name of the domain (e.g., 'books')
    :param translations: optional set of translated documents to be projected onto the same vocabulary
    :param language: the language of the domain (e.g., 'french')
    :param tokken_pattern: the token pattern the sklearn vectorizer will use to split words
    :param min_df: the minimum document frequency below which words are filtered out from the vocabulary
    :return: an instance of Domain (and, if translations is not None, also the tfidf matrix of the translations)
    """
    if issource:
        counter = CountVectorizer(token_pattern=tokken_pattern, min_df=min_df)
        v = counter.fit(labeled_docs).vocabulary_
        tfidf = TfidfVectorizer(sublinear_tf=True, token_pattern=tokken_pattern, vocabulary=v)
    else:
        tfidf = TfidfVectorizer(sublinear_tf=True, token_pattern=tokken_pattern, min_df=min_df)

    U = tfidf.fit_transform(unlabeled_docs)
    X = tfidf.transform(labeled_docs)
    y = np.array(labels)
    V = tfidf.vocabulary_

    domain = Domain(X, y, U, V, domain, language)

    if translations is not None:
        T = tfidf.transform(translations)
        return domain, T
    else:
        return domain
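# Example usage (a sketch with toy documents; Domain is this repository's own container class):
#
#     docs = ['a great book', 'a terrible book', 'an ok read']
#     labels = [1, 0, 1]
#     unlabeled = ['another book', 'yet another read']
#     src = as_domain(docs, labels, unlabeled, issource=True, domain='books', language='en')
#     print(src.X.shape, src.U.shape)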
def WebisCLS10_Upper_task_generator(dataset_home='../datasets/Webis-CLS-10'):
    """
    Generates the upper-bound tasks for cross-lingual experiments on the Webis-CLS-10 dataset.

    :param dataset_home: the path where the dataset is stored
    :return: yields domain tasks in the typical order of appearance in most papers
    """
    print('fetching Webis-CLS-10')
    documents, translations, dictionaries = fetch_Webis_cls_10(
        dataset_home=dataset_home, skip_translations=True, dopickle=True)

    patt = r"(?u)\b\w+\b"  # Japanese may contain words consisting of only one symbol
    taskno = 0
    for target_lan in ['de', 'fr', 'jp']:
        for domain in ['books', 'dvd', 'music']:
            print('Loading Webis-CLS-10 task ' + '{}{}-{}{}'.format(
                target_lan, domain, target_lan, domain).upper())

            tr_t_docs, tr_t_labels = list(zip(*documents[target_lan][domain]['train.processed']))
            te_t_docs, te_t_labels = list(zip(*documents[target_lan][domain]['test.processed']))

            tfidf = TfidfVectorizer(sublinear_tf=True, token_pattern=patt, min_df=1)
            Xtr = tfidf.fit_transform(tr_t_docs)
            Xte = tfidf.transform(te_t_docs)
            ytr = np.array(tr_t_labels)
            yte = np.array(te_t_labels)
            V = tfidf.vocabulary_

            train = Domain(Xtr, ytr, None, V, domain, target_lan)
            test = Domain(Xte, yte, None, V, domain, target_lan)

            print("source: X={}".format(train.X.shape))
            print("target: X={}".format(test.X.shape))

            taskname = '{}. {} {}'.format(taskno, train.name(), test.name())
            taskno += 1

            yield train, test, taskname
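# Example usage (a sketch; iterates the nine target-language upper-bound tasks, 3 languages x 3 domains):
#
#     for train, test, taskname in WebisCLS10_Upper_task_generator():
#         print(taskname, train.X.shape, test.X.shape)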
def WebisCLS10_task_generator(dataset_home='../datasets/Webis-CLS-10', skip_translations=True):
    """
    Generates the tasks for cross-lingual experiments on the Webis-CLS-10 dataset.

    :param dataset_home: the path where the dataset is stored
    :param skip_translations: if True, the machine-translated test sets are not loaded
    :return: yields tasks (source domain, target domain, and a source-to-target word oracle), in the typical
        order of appearance in most papers; if skip_translations is False, each task additionally includes the
        target translations projected onto the source vocabulary
    """
    print('fetching Webis-CLS-10')
    documents, translations, dictionaries = fetch_Webis_cls_10(
        dataset_home=dataset_home, skip_translations=skip_translations, dopickle=True)

    patt = r"(?u)\b\w+\b"  # Japanese may contain words consisting of only one symbol
    source_lan = 'en'
    taskno = 0
    for target_lan in ['de', 'fr', 'jp']:
        for domain in ['books', 'dvd', 'music']:
            print('Loading Webis-CLS-10 task ' + '{}{}-{}{}'.format(
                source_lan, domain, target_lan, domain).upper())

            tr_s_docs, tr_s_labels = list(zip(*documents[source_lan][domain]['train.processed']))
            unlabel_s_docs, _ = list(zip(*documents[source_lan][domain]['unlabeled.processed']))

            if not skip_translations:
                transl_t_docs, transl_t_labels = list(zip(*translations[target_lan][domain]['test.processed']))
                source, T = as_domain(tr_s_docs, tr_s_labels, unlabel_s_docs, issource=True,
                                      translations=transl_t_docs, domain=domain, language=source_lan,
                                      tokken_pattern=patt, min_df=1)
                Ty = np.array(transl_t_labels)
            else:
                source = as_domain(tr_s_docs, tr_s_labels, unlabel_s_docs, issource=True, translations=None,
                                   domain=domain, language=source_lan, tokken_pattern=patt, min_df=1)

            te_t_docs, te_t_labels = list(zip(*documents[target_lan][domain]['test.processed']))
            unlabel_t_docs, _ = list(zip(*documents[target_lan][domain]['unlabeled.processed']))
            target = as_domain(te_t_docs, te_t_labels, unlabel_t_docs, issource=False, domain=domain,
                               language=target_lan, tokken_pattern=patt, min_df=3)

            oracle = WordOracle(dictionaries['{}_{}_dict.txt'.format(source_lan, target_lan)],
                                source_lan, target_lan,
                                analyzer=CountVectorizer(token_pattern=patt).build_analyzer())

            print("source: X={} U={}".format(source.X.shape, source.U.shape))
            print("target: X={} U={}".format(target.X.shape, target.U.shape))

            taskname = '{}. {} {}'.format(taskno, source.name(), target.name())
            taskno += 1

            if skip_translations:
                yield source, target, oracle, taskname
            else:
                target_translations = Domain(T, Ty, None, source.V, domain, language='en')
                yield source, target, target_translations, oracle, taskname
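# Example usage (a sketch; with skip_translations=True each task yields the source domain, the target domain,
# the bilingual word oracle, and the task name; with skip_translations=False it additionally yields the
# translated target documents projected onto the source vocabulary):
#
#     for source, target, oracle, taskname in WebisCLS10_task_generator(skip_translations=True):
#         print(taskname, source.X.shape, target.X.shape)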