Ejemplo n.º 1
0
def _brown_featurise(clusters_by_size, size, nodes, graph, focus):
    """Yield Brown-cluster prefix features for nodes around *focus*.

    Walks up to three steps backwards and forwards from *focus*; for every
    node whose token appears in the Brown cluster file for *size*, one
    feature is yielded per prefix length listed in BROWN_GRAMS.
    """
    # TODO: This is not a particularily pretty way to handle the readers
    global READERS
    # Lazily create the nested cache: cluster-config key -> size -> reader.
    if 'BROWN_READERS' not in READERS:
        READERS['BROWN_READERS'] = defaultdict(dict)
    cache = READERS['BROWN_READERS']
    cache_key = ''.join(str(k) for k in clusters_by_size)
    if size in cache[cache_key]:
        reader = cache[cache_key][size]
    else:
        # First use of this cluster file: parse it and memoise the reader.
        with open(clusters_by_size[size], 'r') as brown_file:
            reader = BrownReader(line.rstrip('\n') for line in brown_file)
        cache[cache_key][size] = reader

    # XXX: TODO: Limited to three steps
    backwards = graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV')))
    forwards = graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT')))
    for _, lbl_path, node in chain(backwards, forwards):
        try:
            cluster = reader[node.value]
        except KeyError:
            # Only generate if we actually have an entry in the cluster
            continue
        for gram_size in BROWN_GRAMS:
            if len(cluster) < gram_size:
                # Don't overgenerate if we don't have enough grams
                break
            yield ('BROWN-{0}-{1}-{2}'.format(size, '-'.join(lbl_path),
                                              cluster), 1.0)
Ejemplo n.º 2
0
def _comp_featurise(nodes, graph, focus):
    """Yield distance-weighted positional features and token n-grams."""
    # XXX: TODO: Limited to three steps
    contexts = chain(
        graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV'))),
        graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT'))))
    for _, lbl_path, node in contexts:
        # The weight halves for each additional step away from the focus.
        weight = 1.0 / (2 ** (len(lbl_path) - 1))
        yield ('WEIGHTED-POSITIONAL-{0}-{1}'.format('-'.join(lbl_path),
                                                    node.value), weight)

    # Token grams
    for gram_size in (3, ):
        tokens = (n.value for n in nodes)
        for tok_gram in nwise(tokens, gram_size):
            yield ('TOK-GRAM-{0}-{1}'.format(gram_size, '-'.join(tok_gram)),
                   1.0)
Ejemplo n.º 3
0
def _david_featurise(nodes, graph, focus):
    """Yield "David" cluster features for nodes within three steps of *focus*."""
    global DAVID_READER
    if DAVID_READER is None:
        # Load and memoise the cluster file on first use.
        from config import DAVID_CLUSTERS_PATH
        with open(DAVID_CLUSTERS_PATH, 'r') as david_file:
            DAVID_READER = DavidReader(line.rstrip('\n')
                                       for line in david_file)

    # XXX: TODO: Limited to three steps
    contexts = chain(
        graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV'))),
        graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT'))))
    for _, lbl_path, node in contexts:
        try:
            cluster = DAVID_READER[node.value]
        except KeyError:
            # Only generate if we actually have an entry in the cluster
            continue
        yield 'DAVID-{0}-{1}'.format('-'.join(lbl_path), cluster), 1.0
Ejemplo n.º 4
0
def _google_featurise(nodes, graph, focus):
    """Yield Google phrase-cluster distance features for context nodes."""
    global GOOGLE_READER
    if GOOGLE_READER is None:
        # Load and memoise the phrase cluster file on first use.
        from config import PHRASE_CLUSTERS_PATH
        with open(PHRASE_CLUSTERS_PATH, 'r') as google_file:
            GOOGLE_READER = GoogleReader(line.rstrip('\n')
                                         for line in google_file)

    contexts = chain(
        graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV'))),
        graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT'))))
    for _, lbl_path, node in contexts:
        try:
            distance_by_cluster = GOOGLE_READER[node.value]
        except KeyError:
            # Only generate if we actually have an entry in the cluster
            continue
        # One feature per cluster, weighted by the recorded distance.
        for cluster, distance in distance_by_cluster.iteritems():
            yield ('GOOGLE-{0}-{1}'.format('-'.join(lbl_path), cluster),
                   distance)
Ejemplo n.º 5
0
def _tsv_featurise(wordrepr_path, separator, wordrepr_name, reader_id, nodes,
                   graph, focus):
    """Yield per-component vector features from a TSV word representation.

    The reader for (*wordrepr_name*, *reader_id*) is built once from
    *wordrepr_path* and cached in the module-level READERS dict.
    """
    global READERS
    cache_key = wordrepr_name + '_' + reader_id + '_READER'
    if cache_key not in READERS:
        with open(wordrepr_path, 'r') as input_file:
            lines = [line.rstrip('\n') for line in input_file]
        READERS[cache_key] = TsvReader(lines, separator)
    reader = READERS[cache_key]

    contexts = chain(
        graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV'))),
        graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT'))))
    for _, lbl_path, node in contexts:
        try:
            vector = reader[node.value]
        except KeyError:
            # Only generate if we actually have an entry in the cluster
            continue
        # One feature per vector component, valued by the component itself.
        for component, value in vector.iteritems():
            yield ('{3}_{2}-{0}-{1}'.format('-'.join(lbl_path), component,
                                            reader_id, wordrepr_name), value)
Ejemplo n.º 6
0
def _bow_featurise(nodes, graph, focus):
    """Yield a bag-of-words feature for every node within three steps."""
    # XXX: TODO: Limited to three steps
    backwards = graph.walk(focus, SeqLblSearch(('PRV', 'PRV', 'PRV')))
    forwards = graph.walk(focus, SeqLblSearch(('NXT', 'NXT', 'NXT')))
    for _, _, node in chain(backwards, forwards):
        yield 'BOW-{0}'.format(node.value), 1.0