Example #1
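# Assumes `re`, `data_io` (from the `commons` package, as in Example #6) and the
# project-specific `Voc`, `UNK_token` and `batch2TrainData` helpers are importable.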
def load_prepare_keyword_sentence_data(datafile: str, limit=100):
    corpus_name = "sci-keyword-sentencs"
    data_g = data_io.read_jsons_from_file(datafile, limit=limit)
    voc = Voc(corpus_name)
    # build a character-level vocabulary over the sentences
    for d in data_g:
        for char in d['sentence']:
            voc.addWord(char)
    voc.trim(10)  # prune rare characters
    assert '#' in voc.word2index.keys()
    print("Counted words:", voc.num_words)

    # def infinite_data_g():
    #     while True:
    #         for d in data_io.read_jsons_from_file(datafile):
    #             yield d

    def normalizeString(s):
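        # collapse whitespace and replace out-of-vocabulary characters with the UNK token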
        s = re.sub(r"\s+", r" ", s).strip()
        s = ''.join([
            c if c in voc.word2index else voc.index2word[UNK_token] for c in s
        ])
        return s

    pairs = [(normalizeString(' '.join(d['keywords'])),
              normalizeString(d['sentence']))
             for d in data_io.read_jsons_from_file(datafile, limit=limit)]
    print('got %d train-samples' % len(pairs))

    def batch2TrainData_fun(voc, pair_batch):
        return batch2TrainData(voc, pair_batch, lambda s: list(s))

    return voc, pairs, batch2TrainData_fun, normalizeString
Example #2
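# Converts SciERC-style records (token-indexed NER spans and relations) into
# character-offset annotations; assumes `numpy`, `data_io` and `typing.Dict` are imported.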
def getting_scierc_tagged_data(jsonl_file):
    def build_sentences(d: Dict):
        # flatten the document's sentences into a single token sequence
        g = [tok for sent in d['sentences'] for tok in sent]
        # interleave each token with a single space; spaces get fractional keys (k + 0.5)
        spaced_tokens = [
            x for k, tok in enumerate(g) for x in [(tok, k), (' ', k + 0.5)]
        ]
        # character offset at which each token (or space) starts in the joined text
        char_offsets = numpy.cumsum([0] + [len(x) for x, _ in spaced_tokens])
        # token index -> character offset of that token in the reconstructed text
        tok2charoff = {
            k: char_offsets[i]
            for i, (tok, k) in enumerate(spaced_tokens)
        }
        text = ''.join([t for t, _ in spaced_tokens])

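        # token-level (inclusive) span/relation indices -> character offsets into `text`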
        spans = [(tok2charoff[token_start], tok2charoff[token_end + 1], label)
                 for sent_spans in d['ner']
                 for token_start, token_end, label in sent_spans]
        relations = [(tok2charoff[s1], tok2charoff[e1 + 1], tok2charoff[s2],
                      tok2charoff[e2 + 1], label)
                     for sent_rel in d['relations']
                     for s1, e1, s2, e2, label in sent_rel]
        return d['doc_key'], text, spans, relations

    return [
        build_sentences(d) for d in data_io.read_jsons_from_file(jsonl_file)
    ]
Example #3
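# Loads all three SciERC splits (relative data path) and converts them to flair Sentences.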
def get_scierc_data_as_flair_sentences():
    # data_path = '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json/'
    data_path = '../data/scierc_data/json/'
    sentences = [
        sent for jsonl_file in ['train.json', 'dev.json', 'test.json']
        for d in data_io.read_jsons_from_file('%s/%s' % (data_path, jsonl_file))
        for sent in build_flair_sentences(d)
    ]
    return sentences
Example #4
def get_data():
    # data_path = '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json'
    data_path = '/docker-share/data/scierc_data/json'
    # data_path = '/home/users/t/tilo-himmelsbach/data/scierc_data/json'
    sentences = [
        sent for jsonl_file in ['train.json', 'dev.json', 'test.json']
        for d in data_io.read_jsons_from_file('%s/%s' %
                                              (data_path, jsonl_file))
        for sent in build_flair_sentences(d)
    ]
    return sentences
Example #5
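# Pairs each raw jsonl record with its precomputed embedding (row k of the saved tensor).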
def load_embedded_data(path: str):
    raw_data = data_io.read_jsons_from_file(path + '/' + 'raw_batch_0.jsonl')
    tensor = torch.load(path + '/processed_batch_0.pt')
    print(tensor.shape)

    def add_key_value(d, k, v):
        d[k] = v
        return d

    return [add_key_value(d, 'embedding', tensor[k])
            for k, d in enumerate(raw_data)]
Example #6
import json

from commons import data_io

from sqlalchemy_util.sqlalchemy_base import get_sqlalchemy_base_engine
from sqlalchemy_util.sqlalchemy_methods import insert_or_update, get_tables_by_reflection

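# Prepares the SciERC train split for insertion into a 'scierc' table obtained via
# SQLAlchemy table reflection.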
if __name__ == '__main__':
    # ip = 'localhost'
    ip = '10.1.1.29'
    sqlalchemy_base, sqlalchemy_engine = get_sqlalchemy_base_engine(host=ip)

    data = data_io.read_jsons_from_file(
        '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json/train.json')
    data = ({**{'id': json.dumps(d.pop('doc_key'))}, **d} for d in data)
    data = (d for d in data if isinstance(d['id'], str))
    table = get_tables_by_reflection(sqlalchemy_base.metadata,
                                     sqlalchemy_engine)['scierc']

    # columns = [Column('id', String, primary_key=True)] + [Column(colname, String) for colname in ['sentences','ner','relations','clusters']]
    # table = Table('scierc', sqlalchemy_base.metadata, *columns, extend_existing=True)

    # recreate the table from scratch
    table.drop(sqlalchemy_engine)
    if not sqlalchemy_engine.has_table(table.name):
        print('creating table %s' % table.name)
        table.create(sqlalchemy_engine)

    def update_fun(val, old_row):
        d = {
            k: json.dumps({'annotator_luan': v})
            for k, v in val.items() if k != 'sentences'
        }  # (remainder of this example is cut off in the source)
Example #7
def read_scierc_data_to_FlairSentences(jsonl_file: str) -> Dataset:
    dataset: Dataset = [
        sent for d in data_io.read_jsons_from_file(jsonl_file)
        for sent in build_flair_sentences(d)
    ]
    return dataset
Example #8
def download_natural_questions(data_path='data'):
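    # NOTE: the data_path argument is ignored; a local sample file is hardcoded below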
    data_path = '/home/tilo/Downloads/v1.0_sample_nq-train-sample.jsonl.gz'
    data = list(data_io.read_jsons_from_file(data_path))
    print()
Example #9
def get_flair_sentences(file):
    return [
        seq for d in data_io.read_jsons_from_file(file)
        for seq in build_flair_sentences(d)
    ]