Ejemplo n.º 1
0
def create_table_code():
    """Build one ``<name> = get_table('<name>')`` statement per table.

    Each name yielded by ``get_tables()`` is turned into an assignment
    statement, which is appended (together with a ``'\\n'`` item) to the
    text accumulated so far through ``joinstr`` with an empty separator.

    Returns:
        The accumulated code; the empty string when ``get_tables()``
        yields nothing.
    """
    statement_tpl = "{0} = get_table('{0}')"
    generated = ""

    for name in get_tables():
        statement = statement_tpl.format(name)
        generated = joinstr([generated, statement, '\n'], "")

    return generated
Ejemplo n.º 2
0
def create_code(dbtype):
    """Assemble the generated source for every table the inspector reports.

    Args:
        dbtype: Key looked up in ``sqldbtypes`` to select the text that is
            substituted for ``{dbtypes}`` in ``header``.

    Returns:
        ``header`` formatted with the selected dbtypes text, followed by the
        per-table code joined with an empty separator.
    """
    codes = [table_code(table) for table in inspector.get_table_names()]
    code = joinstr(codes, "")

    # An unknown (or unhashable) dbtype falls back to an empty string.
    # Only lookup failures are caught so unrelated errors are not hidden.
    try:
        _dbtypes = sqldbtypes[dbtype]
    except (KeyError, TypeError):
        _dbtypes = ""

    return header.format(dbtypes=_dbtypes) + code
Ejemplo n.º 3
0
def table_code(table):
    """Render the code for a single table.

    Args:
        table: Table name; passed to ``inspector.get_columns`` and
            substituted into ``table_tpl`` as both ``TableName`` and
            ``table``.

    Returns:
        ``table_tpl`` formatted with the table name and with the per-column
        snippets combined by ``joinstr`` using ``'\\n'`` as separator.
    """
    column_snippets = []
    for column in inspector.get_columns(table):
        column_snippets.append(create_column_code(column, table))

    body = joinstr(column_snippets, '\n')
    return table_tpl.format(TableName=table, table=table, columns=body)
Ejemplo n.º 4
0
def create_column_code(col, table):
    """Render the code for one column of ``table``.

    Args:
        col: Column description mapping. The keys ``name``, ``nullable`` and
            ``type`` are read; ``primary_key`` is optional and defaults to 0.
        table: Table name, handed to ``foreign_keys_map`` to find which
            columns of the table are foreign keys.

    Returns:
        ``column_tpl`` formatted with the column name and the column
        arguments combined by ``joinstr`` using ``", "`` as separator.
    """
    fnkey_map = foreign_keys_map(table)

    name = col['name']
    nullable = col['nullable']
    # Only a missing key means "not a primary key"; other errors propagate.
    try:
        primary_key = col['primary_key']
    except KeyError:
        primary_key = 0

    # NullType is special-cased instead of being rendered through str().
    if isinstance(col['type'], sqlalchemy.sql.sqltypes.NullType):
        _type = "NullType()"
    else:
        _type = str(col['type'])

    if _type == "TIMESTAMP WITHOUT TIME ZONE":
        _type = "DATETIME()"

    # Type names rendered without an argument list get an empty call suffix.
    if not _type.endswith(')'):
        _type += '()'

    if name in fnkey_map:
        ftable, fcolumn = fnkey_map[name]
        _foreign_keys = "ForeignKey(\"%s.%s\")" % (ftable, fcolumn)
    else:
        # NOTE(review): None is handed to joinstr unchanged, exactly as
        # before; how joinstr treats it is not visible here -- confirm.
        _foreign_keys = None

    code = [
        '\"%s\"' % name,
        _type,
        _foreign_keys,
        "primary_key=%s" % str(primary_key),
        "nullable=%s" % nullable,
    ]

    _code = joinstr(code, ", ")

    return column_tpl.format(colname=name, data=_code)
def generate_sents(output_name, group='dev'):
    """Write candidate (i, c, o) frames for the evidence sentences of a group.

    For every id returned by ``utils.group_ids(group)`` the sentence file,
    the per-sentence binary evidence labels and the per-sentence NER spans
    are read in lockstep (``zip`` stops at the shortest of the three). Each
    sentence with a truthy evidence label produces rows in one of five files
    under ``<DATA_DIR>/ico_acceptor/<output_name>/``, chosen by how many
    ``'i'`` and ``'o'`` spans the sentence's NER record holds
    (presumably intervention and outcome spans -- confirm against the NER
    producer):

    * ``<group>_sample_none.txt``: >= 2 ``i`` and >= 1 ``o``. Every pair of
      distinct ``i`` spans is combined with every ``o`` of the sentence;
      each row gets its own frame index.
    * ``<group>_sample_o.txt``: >= 2 ``i`` and no ``o``. ``o`` candidates
      come from the document-level NER record; rows of one (i, c) pair
      share a frame index.
    * ``<group>_sample_c.txt``: exactly 1 ``i`` and >= 1 ``o``. ``c``
      candidates come from the document-level ``i`` spans; rows of one
      ``o`` share a frame index.
    * ``<group>_sample_co.txt``: exactly 1 ``i`` and no ``o``. ``c`` comes
      from the document-level ``i`` spans other than ``i`` and ``o`` from
      the document-level ``o`` spans; all rows share one frame index.
    * ``<group>_sample_ic.txt``: no ``i`` and >= 1 ``o``. (i, c) pairs come
      from the document-level ``i`` spans; rows of one ``o`` share a frame
      index.

    Sentences with neither ``i`` nor ``o`` spans are skipped. The frame
    index restarts at 0 for every document.

    Args:
        output_name: Name of the output sub-directory below
            ``<DATA_DIR>/ico_acceptor/``.
        group: Group name passed to ``utils.group_ids`` and used as the
            prefix of the output file names.
    """
    outdir = '{}/ico_acceptor/{}/'.format(utils.DATA_DIR, output_name)
    try:
        os.mkdir(outdir)
    except OSError:
        print('Target dir: {} already exists'.format(outdir))

    sent_dir = '{}/documents/sents/'.format(utils.DATA_DIR)
    pmids = utils.group_ids(group)

    def sample_path(kind):
        # Path of the output file for one sampling category.
        return '{}/{}_sample_{}.txt'.format(outdir, group, kind)

    def write_row(handle, fields):
        # Emit one joined record followed by a newline.
        handle.write(utils.joinstr(fields))
        handle.write('\n')

    # The context managers guarantee every output file is flushed and
    # closed, even when processing a document raises.
    with open(sample_path('c'), 'w') as sample_c, \
            open(sample_path('o'), 'w') as sample_o, \
            open(sample_path('ic'), 'w') as sample_ic, \
            open(sample_path('co'), 'w') as sample_co, \
            open(sample_path('none'), 'w') as sample_none:
        for pmid in pmids:

            all_ner_f = '{}/documents/txts/{}.ner_test'.format(
                utils.DATA_DIR, pmid)
            with open(all_ner_f) as all_ner_in:
                all_ner = json.loads(all_ner_in.read())
            sents = utils.readlines('{}/{}.sents'.format(sent_dir, pmid))
            ev_labels = [int(l) for l in
                         utils.readlines(
                             '{}/{}.bert_ev_binary'.format(sent_dir, pmid))]
            with open('{}/{}.ner_test'.format(sent_dir, pmid)) as ner_in:
                ners = [json.loads(l) for l in ner_in.readlines()]

            frame_idx = 0

            for sent_idx, (s, ev, ner) in enumerate(
                    zip(sents, ev_labels, ners)):
                if not ev:
                    continue
                if s == 'ABSTRACT ':  # data artifact due to text generation
                    continue

                n_i = len(ner['i'])
                n_o = len(ner['o'])

                if n_i >= 2:
                    i_pairs = [(i, c) for i, c in combinations(ner['i'], 2)
                               if i != c]
                    if n_o >= 1:
                        # Fully specified frames: one frame index per row.
                        for (i, c), o in product(i_pairs, ner['o']):
                            write_row(
                                sample_none,
                                [pmid, sent_idx, frame_idx, i, c, o, s])
                            frame_idx += 1
                    else:  # n_o == 0
                        o_spans = all_ner['o']
                        for i, c in i_pairs:
                            for o in o_spans:
                                write_row(
                                    sample_o,
                                    [pmid, sent_idx, frame_idx, i, c, o, s])
                            frame_idx += 1

                elif n_i == 1:
                    i = ner['i'][0]
                    if n_o >= 1:
                        c_spans = all_ner['i']
                        for o in ner['o']:
                            for c in c_spans:
                                write_row(
                                    sample_c,
                                    [pmid, sent_idx, frame_idx, i, c, o, s])
                            frame_idx += 1
                    else:  # n_o == 0
                        c_spans = [c for c in all_ner['i'] if c != i]
                        o_spans = all_ner['o']
                        for c, o in product(c_spans, o_spans):
                            write_row(
                                sample_co,
                                [pmid, sent_idx, frame_idx, i, c, o, s])
                        frame_idx += 1

                else:  # n_i == 0
                    if n_o >= 1:
                        ic_pairs = [(i, c) for i, c
                                    in combinations(all_ner['i'], 2)
                                    if i != c]
                        for o in ner['o']:
                            for i, c in ic_pairs:
                                write_row(
                                    sample_ic,
                                    [pmid, sent_idx, frame_idx, i, c, o, s])
                            frame_idx += 1

                    else:  # n_o == 0
                        pass  # too hard! punt!
Ejemplo n.º 6
0
Compute the label frequency of a given input file and print to a csv format to output file
"""
from collections import defaultdict
from docopt import docopt
import logging
logging.basicConfig(level=logging.DEBUG)
import sys
sys.path.append("./common")
from utils import joinstr
import pandas

if __name__ == "__main__":
    # Command-line options are parsed by docopt from the module docstring.
    args = docopt(__doc__)
    input_file = args["--in"]
    output_file = args["--out"]

    df = pandas.read_csv(input_file, sep='\t', header=0)
    # Number of rows carrying each distinct value of the ``label`` column.
    labels_dic = {label: df[df.label == label].shape[0]
                  for label in df.label.unique()}
    total = df.label.shape[0]

    # Print to output file, most frequent label first. Each record holds the
    # label, its count and its share of all rows as a percentage.
    # ``items()`` plus an index-based key work on both Python 2 and 3
    # (tuple-parameter lambdas and ``iteritems`` are Python 2 only).
    with open(output_file, 'w') as fout:
        for label, count in sorted(labels_dic.items(),
                                   key=lambda item: item[1],
                                   reverse=True):
            fout.write(
                joinstr(',', [
                    label, count, '{:.3f}'.format((float(count) / total) * 100)
                ]) + '\n')
Ejemplo n.º 7
0
            if all([next_word.label == 'O' for (_, next_word)
                    in words[word.word_id + 1 : ]]):
                # TODO: this is very inefficient
                postfix = 'E'
            else:
                postfix = word.label.split("-")[0] # Borrow label from last seen tag
            new_label = word.label

        # Reform line, only label is possibly changed
        ret.append([word.word_id, word.word, word.pred, word.pred_id, word.sent_id, word.run_id,
                    new_label])
    return ret

if __name__ == "__main__":
    # Command-line options are parsed by docopt from the module docstring.
    args = docopt(__doc__)
    input_file = args["--in"]
    output_file = args["--out"]

    # iterate over sentences: one sub-frame per run_id between the smallest
    # and the largest id. The upper bound is inclusive (hence the +1);
    # otherwise the sentence with the largest run_id would never be written.
    df = pandas.read_csv(input_file, sep='\t', header=0)
    sents = [df[df.run_id == i]
             for i in range(min(df.run_id), max(df.run_id) + 1)]

    with open(output_file, 'w') as fout:
        # Write header
        fout.write(joinstr('\t', list(df.keys())))
        # Write sents, each followed by a newline
        for sent in sents:
            for line in relabel(sent):
                fout.write(joinstr('\t', line))
            fout.write('\n')