Ejemplo n.º 1
0
def main():
    """Print distances between word pairs as a tab-separated table.

    Word pairs come from two sources, merged into one set: the optional
    ``--pairsfile`` (parsed by ``read_pairs``) and the positional ``WORD``
    arguments, where each two consecutive words form one pair.  The vector
    space given with ``--input`` is loaded with ``read_vector_file`` and,
    unless ``--pos`` is given, passed through ``df_remove_pos``.  Every
    metric requested with ``--distance-metric`` (``cosine`` when none is
    requested) is looked up in ``METRICS`` and handed to
    ``calculate_distance_metrics``; the result is written to standard
    output with ``to_csv`` using tabs and no index column.

    Raises:
        ValueError: If an odd number of positional words is supplied.
    """
    parser = argparse.ArgumentParser(
        description='Computes distances between word pairs.')
    parser.add_argument("--input",
                        "-i",
                        type=openfile,
                        metavar="FILE",
                        help='The input vector space.')
    parser.add_argument('--pairsfile',
                        '-w',
                        metavar='FILE',
                        type=openfile,
                        help='The list of tab separated word pairs.')
    parser.add_argument(
        'words',
        nargs='*',
        metavar='WORD',
        help=('Additional word pairs specified at the command line.  '
              'Every two specifies an additional word pair. Must be '
              'given an even number of words.'))
    parser.add_argument('--pos',
                        '-p',
                        action='store_true',
                        help='Marks that the word pairs are POS tagged.')
    # ``action='append'`` lets the flag be repeated to request several metrics.
    parser.add_argument('--distance-metric',
                        '-d',
                        action='append',
                        choices=METRICS.keys(),
                        help='Distance metrics to use.')
    args = parser.parse_args()

    # Validate the cheap condition before reading any file contents.
    if len(args.words) % 2 != 0:
        # Call-style raise: the ``raise E, msg`` form is Python 2 only and a
        # SyntaxError on Python 3.
        raise ValueError("You need to specify an even number of pair words.")

    pairs = set()
    if args.pairsfile:
        pairs.update(read_pairs(args.pairsfile))
    # Even-indexed words are the left members, odd-indexed the right members.
    pairs.update(zip(args.words[::2], args.words[1::2]))

    # With ``append`` and no default, the attribute is None when -d is absent.
    if not args.distance_metric:
        args.distance_metric = ['cosine']

    vecspace = read_vector_file(args.input)
    if not args.pos:
        # The pairs are not POS tagged, so strip the POS from the targets
        # (presumably the row labels of the vector space -- confirm against
        # df_remove_pos).
        vecspace = df_remove_pos(vecspace)

    distance_metrics = [METRICS[name] for name in args.distance_metric]
    output_measures = calculate_distance_metrics(vecspace, pairs,
                                                 distance_metrics)
    output_measures.to_csv(sys.stdout, sep="\t", index=False)
Ejemplo n.º 2
0
def main():
    """Command-line entry point: compute distances between word pairs.

    Collects word pairs from ``--pairsfile`` (via ``read_pairs``) and from
    the positional ``WORD`` arguments (consecutive words are paired up),
    loads the ``--input`` vector space with ``read_vector_file``, applies
    ``df_remove_pos`` unless ``--pos`` was given, and writes the table
    returned by ``calculate_distance_metrics`` to standard output as
    tab-separated text without an index column.  The metrics are those
    named with ``--distance-metric``; ``cosine`` is used when none is named.

    Raises:
        ValueError: If the number of positional words is odd.
    """
    parser = argparse.ArgumentParser(
                description='Computes distances between word pairs.')
    parser.add_argument("--input", "-i", type=openfile,
                        metavar="FILE", help='The input vector space.')
    parser.add_argument('--pairsfile', '-w', metavar='FILE', type=openfile,
                        help='The list of tab separated word pairs.')
    parser.add_argument('words', nargs='*', metavar='WORD',
                        help=('Additional word pairs specified at the command line.  '
                              'Every two specifies an additional word pair. Must be '
                              'given an even number of words.'))
    parser.add_argument('--pos', '-p', action='store_true',
                        help='Marks that the word pairs are POS tagged.')
    # Repeatable flag: each occurrence appends one metric name.
    parser.add_argument('--distance-metric', '-d', action='append',
                        choices=METRICS.keys(),
                        help='Distance metrics to use.')
    args = parser.parse_args()

    # Reject an unpaired trailing word before doing any file reading.
    if len(args.words) % 2 != 0:
        # ``raise E, msg`` only parses on Python 2; the call form is valid on
        # both Python 2 and Python 3.
        raise ValueError("You need to specify an even number of pair words.")

    pairs = set()
    if args.pairsfile:
        pairs.update(read_pairs(args.pairsfile))
    # Pair up words 0&1, 2&3, ... from the command line.
    pairs.update(zip(args.words[::2], args.words[1::2]))

    # No -d given leaves the attribute as None; fall back to cosine.
    if not args.distance_metric:
        args.distance_metric = ['cosine']

    vecspace = read_vector_file(args.input)
    if not args.pos:
        # need to strip the POS from the targets when the pairs are untagged
        vecspace = df_remove_pos(vecspace)

    distance_metrics = [METRICS[name] for name in args.distance_metric]
    output_measures = calculate_distance_metrics(vecspace, pairs, distance_metrics)
    output_measures.to_csv(sys.stdout, sep="\t", index=False)
Ejemplo n.º 3
0
def main():
    """Compare model distances with compositionality ratings.

    Steps, in order:

    1. Read the ratings table from ``--ratings`` with ``pd.read_table`` and,
       unless ``--self`` is given, drop rows whose ``compound`` and
       ``const`` columns are equal.
    2. Load every ``--input`` vector space through ``read_vector_file``,
       ``df_remove_pos`` and ``norm2_matrix``, labelled by the file's base
       name.  With more than one space, an extra ``"<concat>"`` space built
       by ``pd.concat`` is added.
    3. Compute ``DISTANCE_METRIC`` for every (compound, const) pair in each
       space, naming each result column ``"<space>:<metric name>"``.
    4. Merge all distance tables together and then with the ratings.
    5. Write the merged table to standard output as TSV (unless
       ``--no-tsv``), write the ``correlations`` table (with ``--corrs``),
       and call ``scatters`` to produce plots (with ``--pdf``).

    Exits through ``parser.error`` (status 2) when no ``--input`` is given.
    """
    # Function-scope import: ``reduce`` is a builtin only on Python 2, while
    # ``functools.reduce`` is available on both Python 2.6+ and Python 3.
    from functools import reduce

    parser = argparse.ArgumentParser(
                description='Computes correlations with compositionality ratings.')
    parser.add_argument('--input', '-i', action="append", type=openfile,
                        metavar="FILE", help='Input vector space.')
    parser.add_argument('--ratings', '-r', metavar='COMPFILE', type=openfile,
                        help='The compositionality ratings file.')
    parser.add_argument('--self', '-s', action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument('--no-tsv', '-T', action="store_true",
                        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument('--corrs', '-c', action="store_true",
                        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf', '-p', metavar="FILE", default=None,
                        help='Output plots as a PDF to the given filename.')

    args = parser.parse_args()

    # ``--input`` uses ``append`` with no default, so it is None when omitted;
    # report that as a usage error instead of failing later while iterating.
    if not args.input:
        parser.error("at least one --input vector space is required")

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # Drop the self-comparison rows (compound rated against itself).
        compratings = compratings[compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [
        (basename(f.name), norm2_matrix(df_remove_pos(read_vector_file(f))))
        for f in args.input
    ]

    if len(named_vector_spaces) > 1:
        # need to do concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures independently named
    # (``cdm`` is presumably calculate_distance_metrics -- confirm at its import)
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC])
          .rename(columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces
    ]
    # now we need to join all the distance calculations:
    joined_measures = reduce(pd.merge, distances).rename(
                        columns={"left": "compound", "right": "const"})

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # nicer output: a divider between the two tables when both are printed.
    if not args.no_tsv and args.corrs:
        # Single-argument call form prints identically on Python 2 and 3.
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        correlations(dm_and_comp).to_csv(sys.stdout, index=False, sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)
Ejemplo n.º 4
0
def main():
    """Command-line entry point: relate model distances to human ratings.

    Reads the compositionality ratings given with ``--ratings`` (rows where
    ``compound`` equals ``const`` are dropped unless ``--self`` is set) and
    one or more vector spaces given with ``--input``.  Each space goes
    through ``read_vector_file``, ``df_remove_pos`` and ``norm2_matrix`` and
    is labelled with its file's base name; when several spaces are given, a
    ``"<concat>"`` space made with ``pd.concat`` is added as well.

    ``DISTANCE_METRIC`` is computed for every (compound, const) pair in
    every space, the per-space tables are merged with each other and with
    the ratings, and then:

    * the merged table is written to standard output as TSV unless
      ``--no-tsv`` is given;
    * the table returned by ``correlations`` is written as TSV when
      ``--corrs`` is given;
    * ``scatters`` is called with the ``--pdf`` filename when one is given.

    Exits through ``parser.error`` (status 2) when no ``--input`` is given.
    """
    # ``reduce`` stopped being a builtin in Python 3; importing it from
    # functools here works on Python 2.6+ and Python 3 alike.
    from functools import reduce

    parser = argparse.ArgumentParser(
        description='Computes correlations with compositionality ratings.')
    parser.add_argument('--input',
                        '-i',
                        action="append",
                        type=openfile,
                        metavar="FILE",
                        help='Input vector space.')
    parser.add_argument('--ratings',
                        '-r',
                        metavar='COMPFILE',
                        type=openfile,
                        help='The compositionality ratings file.')
    parser.add_argument('--self',
                        '-s',
                        action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument(
        '--no-tsv',
        '-T',
        action="store_true",
        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument(
        '--corrs',
        '-c',
        action="store_true",
        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf',
                        '-p',
                        metavar="FILE",
                        default=None,
                        help='Output plots as a PDF to the given filename.')

    args = parser.parse_args()

    # An omitted ``--input`` leaves None here (append action, no default);
    # turn that into a usage error rather than a TypeError further down.
    if not args.input:
        parser.error("at least one --input vector space is required")

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # Keep only rows that compare a compound with a different word.
        compratings = compratings[
            compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [(basename(f.name),
                            norm2_matrix(df_remove_pos(read_vector_file(f))))
                           for f in args.input]

    if len(named_vector_spaces) > 1:
        # need to do concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures independently named
    # (``cdm`` is presumably calculate_distance_metrics -- confirm at its import)
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC]).rename(
            columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces
    ]
    # now we need to join all the distance calculations:
    joined_measures = reduce(pd.merge, distances).rename(columns={
        "left": "compound",
        "right": "const"
    })

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # nicer output: separate the two tables with a rule when both are shown.
    if not args.no_tsv and args.corrs:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        correlations(dm_and_comp).to_csv(sys.stdout,
                                         index=False,
                                         sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)