Code example #1
0
def intersec_models(modellist, intersec_vocab):
    """Align every model in *modellist* to the first one over a shared vocabulary.

    Each model after the first is intersected with ``modellist[0]`` via
    ``intersection_align_gensim``; the helper's return value is ignored
    here, presumably because it mutates the models in place — verify
    against the helper's contract.

    :param modellist: list of gensim word2vec models; the first entry is the anchor
    :param intersec_vocab: vocabulary the intersection is restricted to
    :return: the same list, with its models aligned
    """
    anchor = modellist[0]
    for other in modellist[1:]:
        intersection_align_gensim(m1=anchor, m2=other, words=intersec_vocab)
    return modellist
Code example #2
0
    def __init__(self, w2v1, w2v2, assume_vocabs_are_identical=False):
        """Store the two models, intersecting their vocabularies first.

        Unless the caller asserts the vocabularies are already identical,
        shallow copies of both models are passed through
        ``intersection_align_gensim`` and the aligned results are kept.
        NOTE(review): ``copy.copy`` is shallow, so deeply nested state may
        still be shared with the caller's models — confirm the helper's
        mutation behavior.
        """
        if assume_vocabs_are_identical:
            self.w2v1 = w2v1
            self.w2v2 = w2v2
        else:
            self.w2v1, self.w2v2 = intersection_align_gensim(copy.copy(w2v1),
                                                             copy.copy(w2v2))
Code example #3
0
def sims_aligned(year, word, *args):
    """Pairwise-align the given models, then collect similarity data for *word*.

    Every pair of models in *args* is vocabulary-intersected and then
    Procrustes-aligned (both helpers' return values are discarded,
    presumably because they mutate in place — verify). Afterwards the
    top-7 neighbours of *word* in every model are gathered, along with
    the word's vector in each model except the first.

    :return: tuple of (neighbour words, vectors of *word* in args[1:],
             args[0], word, year)
    """
    for left, right in combinations(args, 2):
        intersection_align_gensim(left, right)
        smart_procrustes_align_gensim(left, right)

    all_most_sim = []
    wrd_vectors = []
    for idx, model in enumerate(args):
        # most_similar returns (word, score) pairs; keep only the words.
        all_most_sim.extend(pair[0] for pair in model.most_similar(word, topn=7))
        if idx:
            wrd_vectors.append(model[word])

    return all_most_sim, wrd_vectors, args[0], word, year
Code example #4
0
File: sampling.py  Project: wadimiusz/diachrony
def _parse_args():
    """Build and parse the command-line interface of the sampling script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--models',
                        nargs='+',
                        help='paths to models to compare pairwise')
    parser.add_argument(
        '--pos-tag',
        default=None,
        type=str,
        help='specify this to remove words with other pos tags',
        dest='pos_tag')
    parser.add_argument(
        '--top-n-most-frequent-words',
        default=None,
        type=int,
        help=
        'you can specify n so that both positive and negative samples are from '
        'top n most frequent words',
        dest="top_n_most_frequent_words")
    parser.add_argument('--positive-samples',
                        type=int,
                        default=10,
                        help='words that have changed most',
                        dest='positive_samples')
    parser.add_argument('--negative-samples',
                        type=int,
                        default=10,
                        help='randomly samples words',
                        dest='negative_samples')
    parser.add_argument('--shuffle',
                        action='store_true',
                        help='use this argument to shuffle the output')
    parser.add_argument('--savefig',
                        type=str,
                        default=None,
                        help='if specified, this program will plot '
                        'the distributions of positive and negative samples'
                        'and store them in the path specified')
    return parser.parse_args()


def _log_sample_stats(positive_sample, negative_sample, vocab):
    """Write debug statistics about one positive/negative sample pair to stderr.

    :param positive_sample: word selected because its meaning changed most
    :param negative_sample: random word drawn from the same frequency bin
    :param vocab: the vocabulary object the get_bin/get_percentile/get_count
                  helpers expect
    """
    # Local import: the module-level imports of this file are not visible
    # here; previously this import sat inside the innermost sampling loop.
    import sys

    print("Positive sample:", positive_sample, file=sys.stderr)
    print("Negative sample:", negative_sample, file=sys.stderr)
    print("Positive bin:",
          str(get_bin(positive_sample, vocab)),
          file=sys.stderr)
    print("Negative bin:",
          str(get_bin(negative_sample, vocab)),
          file=sys.stderr)
    print("Positive percentile:",
          str(get_percentile(positive_sample, vocab)),
          file=sys.stderr)
    print("Negative percentile:",
          str(get_percentile(negative_sample, vocab)),
          file=sys.stderr)
    print("Positive count:",
          str(get_count(positive_sample, vocab)),
          file=sys.stderr)
    print("Negative count:",
          str(get_count(negative_sample, vocab)),
          file=sys.stderr)
    print("==============================================",
          file=sys.stderr)


def _extract_year(model_path):
    """Parse the year out of a 'wordvectors/<year>.model' path.

    :raises ValueError: if the path does not match the expected pattern
    """
    if model_path.startswith('wordvectors/') and model_path.endswith('.model'):
        return int(model_path[len('wordvectors/'):-len('.model')])
    raise ValueError(
        "Pattern of {path} is not recognized. Path to model must start with 'wordvectors/'"
        " and end with '.model'. Feel free to change the pattern in the source code."
        .format(path=model_path))


def _plot_count_histograms(pos_counts, neg_counts, savefig_path):
    """Overlay frequency histograms of positive/negative samples and save them."""
    # Clip the x-range at the 99th percentile of positive counts so a few
    # very frequent words do not flatten the histogram.
    upper = np.quantile(pos_counts, 0.99)
    plt.hist(pos_counts,
             alpha=0.5,
             bins=10,
             label='positive',
             range=(0, upper))
    plt.hist(neg_counts,
             alpha=0.5,
             bins=10,
             label='negative',
             range=(0, upper))
    plt.legend(loc='upper right')
    plt.savefig(savefig_path)


def main():
    """Sample positive (most-changed) and negative (random, frequency-matched)
    words from every consecutive pair of models and print them as CSV on stdout.

    Positive samples come from GlobalAnchors.get_changes; each negative sample
    is drawn at random from the same frequency bin as its positive counterpart.
    Per-sample debug statistics go to stderr, raw counts to
    pos_counts.txt / neg_counts.txt, and an optional histogram to --savefig.
    """
    args = _parse_args()

    samples = list()
    labels = list()
    base_year = list()
    ratings = list()

    pos_counts = list()
    neg_counts = list()

    for num, (model1_path,
              model2_path) in enumerate(zip(args.models, args.models[1:])):
        log("{num} / {total} {model1} {model2}".format(num=num,
                                                       total=len(args.models) -
                                                       1,
                                                       model1=model1_path,
                                                       model2=model2_path),
            end='\r')

        # NOTE(review): 'Done' is logged at the top of each iteration, before
        # the models are even loaded — possibly meant to come after the work.
        log('Done')
        model1 = load_model(model1_path)
        model2 = load_model(model2_path)
        model1, model2 = intersection_align_gensim(
            model1,
            model2,
            pos_tag=args.pos_tag,
            top_n_most_frequent_words=args.top_n_most_frequent_words)
        global_anchors_result = GlobalAnchors.get_changes(
            model1, model2, args.positive_samples)
        positive_samples = [word for (word, score) in global_anchors_result]
        # NOTE(review): the vocabulary is read as model1.wv.vocab here but as
        # model1.vocab below — confirm both aliases refer to the same mapping.
        possible_negative_samples = set(
            model1.wv.vocab.keys()) - set(positive_samples)
        negative_samples = list()
        for positive_sample in positive_samples:
            # Match each negative sample to the frequency bin of its
            # positive counterpart so the two distributions are comparable.
            number_of_bin = get_bin(positive_sample, model1.vocab)
            eligible_negative_samples = [
                word for word in possible_negative_samples
                if get_bin(word, model1.vocab) == number_of_bin
            ]

            negative_sample = random.choice(eligible_negative_samples)
            negative_samples.append(negative_sample)
            _log_sample_stats(positive_sample, negative_sample, model1.vocab)

        samples.extend(positive_samples)
        samples.extend(negative_samples)

        # NOTE(review): assumes get_changes returned exactly
        # args.positive_samples words; otherwise these lists go out of sync.
        labels.extend([1] * args.positive_samples)
        labels.extend([0] * args.negative_samples)

        pos_counts.extend(
            [get_count(word, model1.vocab) for word in positive_samples])
        neg_counts.extend(
            [get_count(word, model1.vocab) for word in negative_samples])

        # Positive samples are ranked 1..n by "changedness"; negatives get -1.
        ratings.extend(range(1, args.positive_samples + 1))
        ratings.extend([-1] * args.negative_samples)

        year = _extract_year(model1_path)

        base_year.extend([year] *
                         (args.positive_samples + args.negative_samples))

    output = pd.DataFrame({
        "WORD": samples,
        "LABEL": labels,
        'ASSESSOR_LABEL': -1,
        'BASE_YEAR': base_year,
        "RATING": ratings
    })

    if args.shuffle:
        output = output.sample(frac=1).reset_index(
            drop=True)  # this shuffles the dataframe but not its index

    output.index.names = ['ID']
    print(output.to_csv())

    with open('neg_counts.txt', 'w') as f:
        print(neg_counts, file=f)

    with open('pos_counts.txt', 'w') as f:
        print(pos_counts, file=f)

    if args.savefig is not None:
        _plot_count_histograms(pos_counts, neg_counts, args.savefig)
Code example #5
0
 def __init__(self, w2v1: gensim.models.KeyedVectors, w2v2: gensim.models.KeyedVectors):
     """Vocabulary-intersect the two models, then Procrustes-align the second.

     The aligned pair is stored on the instance; the Procrustes helper's
     result is kept separately as ``w2v2_changed``. NOTE(review): the
     originals are passed to ``smart_procrustes_align_gensim`` after the
     intersection — this only works if the intersection mutates in place;
     verify the helper's semantics.
     """
     aligned_first, aligned_second = intersection_align_gensim(w2v1, w2v2)
     self.w2v1 = aligned_first
     self.w2v2 = aligned_second
     self.w2v2_changed = smart_procrustes_align_gensim(w2v1, w2v2)
Code example #6
0
File: main.py  Project: wadimiusz/diachrony
def comparison(w2v1_path: str, w2v2_path: str, top_n_neighbors: int,
               top_n_changed_words: (int, None),
               top_n_most_frequent_words: (int, None), pos_tag: (str, None),
               informative: bool):
    """
    Load two models from the given paths and compare word meanings across them.
    :param w2v1_path: the path to the first model
    :param w2v2_path: the path to the second model
    :param top_n_neighbors: we will compare top n neighbors of words
    :param top_n_changed_words: we will output top n most interesting words, may be int or None
    :param top_n_most_frequent_words: we will use top n most frequent words from each model, may be int or None
    :param pos_tag: specify this to consider only words with a specific pos_tag
    :param informative: if True we use informative_output for printing output (more verbose and interpretable)
    :return: None
    """
    model1 = load_model(w2v1_path)
    model2 = load_model(w2v2_path)

    log("The first model contains {words1} words, e. g. {word1}\n"
        "The second model contains {words2} words, e. g. {word2}".format(
            words1=len(model1.wv.vocab),
            words2=len(model2.wv.vocab),
            word1=random.choice(list(model1.wv.vocab.keys())),
            word2=random.choice(list(model2.wv.vocab.keys()))))

    model1, model2 = intersection_align_gensim(
        model1,
        model2,
        pos_tag=pos_tag,
        top_n_most_frequent_words=top_n_most_frequent_words)

    log("After preprocessing, the first model contains {words1} words, e. g. {word1}\n"
        "The second model contains {words2} words, e. g. {word2}".format(
            words1=len(model1.wv.vocab),
            words2=len(model2.wv.vocab),
            word1=random.choice(list(model1.wv.vocab.keys())),
            word2=random.choice(list(model2.wv.vocab.keys()))))

    # Factories are deferred so each algorithm is constructed immediately
    # before its get_changes call, preserving the original execution order
    # (some of these constructors align the models as a side effect).
    algorithms = (
        ('JACCARD',
         lambda: Jaccard(w2v1=model1, w2v2=model2,
                         top_n_neighbors=top_n_neighbors)),
        ('KENDALL TAU',
         lambda: KendallTau(w2v1=model1, w2v2=model2,
                            top_n_neighbors=top_n_neighbors)),
        ('PROCRUSTES',
         lambda: ProcrustesAligner(w2v1=model1, w2v2=model2)),
        ('GLOBAL ANCHORS',
         lambda: GlobalAnchors(w2v1=model1, w2v2=model2)),
    )

    # Compute every result first, then print them all.
    scored = [(name,
               make().get_changes(top_n_changed_words=top_n_changed_words))
              for name, make in algorithms]

    for name, result in scored:
        if informative:
            informative_output(result, model1, model2, top_n_neighbors, name)
        else:
            simple_output(result, name)
Code example #7
0
    def intersec_models(self, modeldict, intersec_vocab):
        """Align every non-2015 model in *modeldict* with the 2015 anchor model.

        ``intersection_align_gensim`` presumably mutates both models in
        place (its return value is discarded here) — verify against the
        helper's contract.

        :param modeldict: mapping from year to gensim model; must contain 2015
        :param intersec_vocab: vocabulary to restrict the intersection to
        :return: the same mapping, with its models aligned
        """
        anchor = modeldict.get(2015)
        for year, model in modeldict.items():
            if year == 2015:
                continue
            intersection_align_gensim(m1=anchor, m2=model, words=intersec_vocab)

        return modeldict