Esempio n. 1
0
def main():
    """Command-line entry point: stack two vector spaces and save their SVD.

    Reads the files named by ``--lexspace`` and ``--assoc`` through
    ``openfile`` / ``read_vector_file``, passes each result through
    ``norm2_matrix``, concatenates the two frames under the outer index
    keys ``"lex"`` and ``"assoc"``, runs ``svd`` on the underlying array
    and writes the three returned factors to ``svd.npz`` in the current
    working directory.

    The arrays are passed to ``np.savez`` positionally, so they are stored
    in the archive under the names ``arr_0``, ``arr_1`` and ``arr_2``.
    """
    parser = argparse.ArgumentParser(
                description='Computes an LSA model correlating associations and lexsem model.')
    parser.add_argument('--lexspace', '-l', metavar='FILE',
                        help='The input lexical space.')
    parser.add_argument('--assoc', '-a', metavar='FILE',
                        help='The input association space.')
    args = parser.parse_args()

    # NOTE(review): neither option is required, so openfile() may receive
    # None -- confirm how the helper treats that.
    lex = norm2_matrix(read_vector_file(openfile(args.lexspace)))
    assoc = norm2_matrix(read_vector_file(openfile(args.assoc)))

    # pd.concat expects the objects as ONE sequence; a second positional
    # argument would be interpreted as ``axis``.
    together = pd.concat([lex, assoc], keys=("lex", "assoc"))

    # ``.values`` is available on every pandas release, whereas
    # ``as_matrix()`` was removed in pandas 1.0.
    org_matrix = together.values
    # NOTE(review): if ``svd`` is numpy's or scipy's implementation, the
    # third return value is V transposed -- confirm before consuming it.
    U, S, V = svd(org_matrix)

    np.savez("svd.npz", U, S, V)
Esempio n. 2
0
def main():
    """Command-line entry point: stack two vector spaces and save their SVD.

    Reads the files named by ``--lexspace`` and ``--assoc`` through
    ``openfile`` / ``read_vector_file``, passes each result through
    ``norm2_matrix``, concatenates the two frames under the outer index
    keys ``"lex"`` and ``"assoc"``, runs ``svd`` on the underlying array
    and writes the three returned factors to ``svd.npz`` in the current
    working directory.

    The arrays are passed to ``np.savez`` positionally, so they are stored
    in the archive under the names ``arr_0``, ``arr_1`` and ``arr_2``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Computes an LSA model correlating associations and lexsem model.')
    parser.add_argument('--lexspace',
                        '-l',
                        metavar='FILE',
                        help='The input lexical space.')
    parser.add_argument('--assoc',
                        '-a',
                        metavar='FILE',
                        help='The input association space.')
    args = parser.parse_args()

    # NOTE(review): neither option is required, so openfile() may receive
    # None -- confirm how the helper treats that.
    lex = norm2_matrix(read_vector_file(openfile(args.lexspace)))
    assoc = norm2_matrix(read_vector_file(openfile(args.assoc)))

    # pd.concat expects the objects as ONE sequence; a second positional
    # argument would be interpreted as ``axis``.
    together = pd.concat([lex, assoc], keys=("lex", "assoc"))

    # ``.values`` is available on every pandas release, whereas
    # ``as_matrix()`` was removed in pandas 1.0.
    org_matrix = together.values
    # NOTE(review): if ``svd`` is numpy's or scipy's implementation, the
    # third return value is V transposed -- confirm before consuming it.
    U, S, V = svd(org_matrix)

    np.savez("svd.npz", U, S, V)
Esempio n. 3
0
def main():
    """Command-line entry point: compare model distances with human ratings.

    Steps, in order:

    1. Read the ratings table given by ``--ratings``; unless ``--self`` is
       set, drop rows whose ``compound`` and ``const`` columns are equal.
    2. Load every ``--input`` vector space (``read_vector_file`` ->
       ``df_remove_pos`` -> ``norm2_matrix``), naming each one after the
       basename of its file.  When more than one space is given, an extra
       space named ``"<concat>"`` is built by concatenating all of them.
    3. For each space, call ``cdm`` on the (compound, const) pairs with
       ``DISTANCE_METRIC`` and rename the metric column to
       ``"<space name>:<metric name>"`` so the columns stay distinct.
    4. Merge all distance tables together, then merge the result with the
       ratings table.
    5. Write the merged table to stdout as TSV unless ``--no-tsv`` is set,
       write the output of ``correlations`` as TSV if ``--corrs`` is set,
       and call ``scatters`` with the ``--pdf`` filename if one was given.
    """
    # reduce is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    parser = argparse.ArgumentParser(
                description='Computes correlations with compositionality ratings.')
    # --input and --ratings are consumed unconditionally below, so a missing
    # one is reported as a usage error instead of surfacing as a traceback.
    parser.add_argument('--input', '-i', action="append", type=openfile,
                        required=True,
                        metavar="FILE", help='Input vector space.')
    parser.add_argument('--ratings', '-r', metavar='COMPFILE', type=openfile,
                        required=True,
                        help='The compositionality ratings file.')
    parser.add_argument('--self', '-s', action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument('--no-tsv', '-T', action="store_true",
                        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument('--corrs', '-c', action="store_true",
                        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf', '-p', metavar="FILE", default=None,
                        help='Output plots as a PDF to the given filename.')

    args = parser.parse_args()

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # discard the rows that rate a word against itself
        compratings = compratings[compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [
        (basename(f.name), norm2_matrix(df_remove_pos(read_vector_file(f))))
        for f in args.input
    ]

    if len(named_vector_spaces) > 1:
        # with several inputs, also evaluate their concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures independently named
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC])
          .rename(columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces
    ]
    # Join all the distance tables; pd.merge with no ``on`` joins on the
    # columns the frames have in common.  The "left"/"right" columns are
    # then renamed to match the ratings table.
    joined_measures = reduce(pd.merge, distances).rename(
                        columns={"left": "compound", "right": "const"})

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # visual separator between the two tables when both are printed
    if not args.no_tsv and args.corrs:
        # Call form with a single parenthesised expression: same output on
        # Python 2 and valid syntax on Python 3.
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        # to_csv writes straight to stdout; there is no result to keep
        correlations(dm_and_comp).to_csv(sys.stdout, index=False, sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)
Esempio n. 4
0
def main():
    """Command-line entry point: compare model distances with human ratings.

    Steps, in order:

    1. Read the ratings table given by ``--ratings``; unless ``--self`` is
       set, drop rows whose ``compound`` and ``const`` columns are equal.
    2. Load every ``--input`` vector space (``read_vector_file`` ->
       ``df_remove_pos`` -> ``norm2_matrix``), naming each one after the
       basename of its file.  When more than one space is given, an extra
       space named ``"<concat>"`` is built by concatenating all of them.
    3. For each space, call ``cdm`` on the (compound, const) pairs with
       ``DISTANCE_METRIC`` and rename the metric column to
       ``"<space name>:<metric name>"`` so the columns stay distinct.
    4. Merge all distance tables together, then merge the result with the
       ratings table.
    5. Write the merged table to stdout as TSV unless ``--no-tsv`` is set,
       write the output of ``correlations`` as TSV if ``--corrs`` is set,
       and call ``scatters`` with the ``--pdf`` filename if one was given.
    """
    # reduce is a builtin only on Python 2; functools provides it on both
    # Python 2.6+ and Python 3.
    from functools import reduce

    parser = argparse.ArgumentParser(
        description='Computes correlations with compositionality ratings.')
    # --input and --ratings are consumed unconditionally below, so a missing
    # one is reported as a usage error instead of surfacing as a traceback.
    parser.add_argument('--input',
                        '-i',
                        action="append",
                        type=openfile,
                        required=True,
                        metavar="FILE",
                        help='Input vector space.')
    parser.add_argument('--ratings',
                        '-r',
                        metavar='COMPFILE',
                        type=openfile,
                        required=True,
                        help='The compositionality ratings file.')
    parser.add_argument('--self',
                        '-s',
                        action="store_true",
                        help='Whether we should include self-comp ratings.')
    parser.add_argument(
        '--no-tsv',
        '-T',
        action="store_true",
        help="*Don't* output the TSV containing comp and model ratings.")
    parser.add_argument(
        '--corrs',
        '-c',
        action="store_true",
        help='Specifies whether correlations should be computed and outputed.')
    parser.add_argument('--pdf',
                        '-p',
                        metavar="FILE",
                        default=None,
                        help='Output plots as a PDF to the given filename.')

    args = parser.parse_args()

    compratings = pd.read_table(args.ratings)
    if not args.self:
        # discard the rows that rate a word against itself
        compratings = compratings[
            compratings["compound"] != compratings["const"]]

    word_pairs = set(zip(compratings['compound'], compratings['const']))

    named_vector_spaces = [(basename(f.name),
                            norm2_matrix(df_remove_pos(read_vector_file(f))))
                           for f in args.input]

    if len(named_vector_spaces) > 1:
        # with several inputs, also evaluate their concatenation
        names, vses = zip(*named_vector_spaces)
        concat_space = pd.concat(vses, keys=names)
        named_vector_spaces.append(("<concat>", concat_space))

    # compute all the distances AND keep the different measures independently named
    distances = [
        cdm(vs, word_pairs, [DISTANCE_METRIC]).rename(
            columns={DISTANCE_METRIC.name: fn + ":" + DISTANCE_METRIC.name})
        for fn, vs in named_vector_spaces
    ]
    # Join all the distance tables; pd.merge with no ``on`` joins on the
    # columns the frames have in common.  The "left"/"right" columns are
    # then renamed to match the ratings table.
    joined_measures = reduce(pd.merge, distances).rename(columns={
        "left": "compound",
        "right": "const"
    })

    # finally join the similarity measures with the human ratings
    dm_and_comp = pd.merge(compratings, joined_measures)

    # output dm_and_comp unless the user specified not to
    if not args.no_tsv:
        dm_and_comp.to_csv(sys.stdout, index=False, sep="\t")

    # visual separator between the two tables when both are printed
    if not args.no_tsv and args.corrs:
        # Call form with a single parenthesised expression: same output on
        # Python 2 and valid syntax on Python 3.
        print("\n" + "-" * 80 + "\n")

    # compute and output correlations if the user asked
    if args.corrs:
        # to_csv writes straight to stdout; there is no result to keep
        correlations(dm_and_comp).to_csv(sys.stdout,
                                         index=False,
                                         sep="\t")

    # plot the measures if the user asked.
    if args.pdf:
        scatters(dm_and_comp, args.pdf)