import re

from commons import data_io

# Voc, UNK_token and batch2TrainData are project-local seq2seq helpers defined elsewhere in the repo.


def load_prepare_keyword_sentence_data(datafile: str, limit=100):
    corpus_name = "sci-keyword-sentencs"
    # the vocabulary is built from the first 100 records only, independent of `limit`
    data_g = data_io.read_jsons_from_file(datafile, limit=100)
    voc = Voc(corpus_name)
    for d in data_g:
        for char in d['sentence']:
            voc.addWord(char)
    voc.trim(10)
    assert '#' in voc.word2index.keys()
    print("Counted words:", voc.num_words)

    # def infinite_data_g():
    #     while True:
    #         for d in data_io.read_jsons_from_file(datafile):
    #             yield d

    def normalizeString(s):
        # collapse whitespace and replace out-of-vocabulary characters by the UNK symbol
        s = re.sub(r"\s+", r" ", s).strip()
        s = ''.join([
            c if c in voc.word2index else voc.index2word[UNK_token]
            for c in s
        ])
        return s

    pairs = [(normalizeString(' '.join(d['keywords'])),
              normalizeString(d['sentence']))
             for d in data_io.read_jsons_from_file(datafile, limit=limit)]
    print('got %d train-samples' % len(pairs))

    def batch2TrainData_fun(voc, pair_batch):
        # character-level tokenization: split every string into a list of chars
        return batch2TrainData(voc, pair_batch, lambda s: list(s))

    return voc, pairs, batch2TrainData_fun, normalizeString
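# Hypothetical usage sketch of the loader above: the JSONL path and batch size are
# placeholders, assuming a file in which each record has 'keywords' (list of str)
# and 'sentence' (str) fields.
import random

voc, pairs, batch2TrainData_fun, normalize = load_prepare_keyword_sentence_data(
    'keyword_sentences.jsonl', limit=1000)
pair_batch = [random.choice(pairs) for _ in range(64)]
training_batch = batch2TrainData_fun(voc, pair_batch)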
from typing import Dict

import numpy

from commons import data_io


def getting_scierc_tagged_data(jsonl_file):
    def build_sentences(d: Dict):
        # flatten the document into one token sequence
        g = [tok for sent in d['sentences'] for tok in sent]
        # interleave tokens with single spaces; keys k and k + 0.5 index token
        # starts and the spaces that follow them
        spaced_tokens = [
            x for k, tok in enumerate(g)
            for x in [(tok, k), (' ', k + 0.5)]
        ]
        char_offsets = numpy.cumsum([0] + [len(x) for x, _ in spaced_tokens])
        tok2charoff = {
            k: char_offsets[i]
            for i, (tok, k) in enumerate(spaced_tokens)
        }
        text = ''.join([t for t, _ in spaced_tokens])
        # convert token-level NER spans and relations to character offsets
        spans = [(tok2charoff[token_start], tok2charoff[token_end + 1], label)
                 for sent_spans in d['ner']
                 for token_start, token_end, label in sent_spans]
        relations = [(tok2charoff[s1], tok2charoff[e1 + 1],
                      tok2charoff[s2], tok2charoff[e2 + 1], label)
                     for sent_rel in d['relations']
                     for s1, e1, s2, e2, label in sent_rel]
        return d['doc_key'], text, spans, relations

    return [
        build_sentences(d) for d in data_io.read_jsons_from_file(jsonl_file)
    ]
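# Illustrative (made-up) SciERC-style record showing the fields the converter reads;
# 'ner' and 'relations' hold document-level token indices, one list per sentence.
example_record = {
    'doc_key': 'toy_doc',
    'sentences': [['We', 'propose', 'a', 'neural', 'parser', '.']],
    'ner': [[[3, 4, 'Method']]],   # token span 3..4 ("neural parser")
    'relations': [[]],
}
# getting_scierc_tagged_data('train.json') yields one
# (doc_key, text, char_spans, char_relations) tuple per such record.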
def get_scierc_data_as_flair_sentences():
    # data_path = '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json/'
    data_path = '../data/scierc_data/json/'
    sentences = [sent
                 for jsonl_file in ['train.json', 'dev.json', 'test.json']
                 for d in data_io.read_jsons_from_file('%s/%s' % (data_path, jsonl_file))
                 for sent in build_flair_sentences(d)]
    return sentences
def get_data():
    # data_path = '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json'
    data_path = '/docker-share/data/scierc_data/json'
    # data_path = '/home/users/t/tilo-himmelsbach/data/scierc_data/json'
    sentences = [
        sent for jsonl_file in ['train.json', 'dev.json', 'test.json']
        for d in data_io.read_jsons_from_file('%s/%s' % (data_path, jsonl_file))
        for sent in build_flair_sentences(d)
    ]
    return sentences
import torch

from commons import data_io


def load_embedded_data(path: str):
    raw_data = data_io.read_jsons_from_file(path + '/' + 'raw_batch_0.jsonl')
    tensor = torch.load(path + '/processed_batch_0.pt')
    print(tensor.shape)

    def add_key_value(d, k, v):
        d[k] = v
        return d

    # attach the k-th embedding tensor to the k-th raw record
    return [add_key_value(d, 'embedding', tensor[k])
            for k, d in enumerate(raw_data)]
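# Hypothetical usage: 'embeddings_dir' is a placeholder for a directory containing
# raw_batch_0.jsonl and the matching processed_batch_0.pt produced upstream.
docs = load_embedded_data('embeddings_dir')
print(docs[0]['embedding'].shape)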
import json

from commons import data_io
from sqlalchemy_util.sqlalchemy_base import get_sqlalchemy_base_engine
from sqlalchemy_util.sqlalchemy_methods import insert_or_update, get_tables_by_reflection

if __name__ == '__main__':
    # ip = 'localhost'
    ip = '10.1.1.29'
    sqlalchemy_base, sqlalchemy_engine = get_sqlalchemy_base_engine(host=ip)
    data = data_io.read_jsons_from_file(
        '/home/tilo/code/NLP/scisci_nlp/data/scierc_data/json/train.json')
    data = ({**{'id': json.dumps(d.pop('doc_key'))}, **d} for d in data)
    data = (d for d in data if isinstance(d['id'], str))
    table = get_tables_by_reflection(sqlalchemy_base.metadata,
                                     sqlalchemy_engine)['scierc']
    # columns = [Column('id', String, primary_key=True)] + [Column(colname, String) for colname in ['sentences', 'ner', 'relations', 'clusters']]
    # table = Table('scierc', sqlalchemy_base.metadata, *columns, extend_existing=True)
    table.drop(sqlalchemy_engine)
    if not sqlalchemy_engine.has_table(table.name):
        print('creating table %s' % table.name)
        table.create()

    def update_fun(val, old_row):
        d = {
            k: json.dumps({'annotator_luan': v})
            for k, v in val.items() if k != 'sentences'
def read_scierc_data_to_FlairSentences(jsonl_file: str) -> Dataset:
    dataset: Dataset = [
        sent for d in data_io.read_jsons_from_file(jsonl_file)
        for sent in build_flair_sentences(d)
    ]
    return dataset
def download_natural_questions(data_path='data'):
    # NOTE: the argument is currently overridden by a hard-coded local sample file
    data_path = '/home/tilo/Downloads/v1.0_sample_nq-train-sample.jsonl.gz'
    data = list(data_io.read_jsons_from_file(data_path))
    print()
def get_flair_sentences(file):
    return [seq for d in data_io.read_jsons_from_file(file)
            for seq in build_flair_sentences(d)]
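# Several loaders above call build_flair_sentences, which is defined elsewhere in the
# repo. Below is only a minimal sketch of what such a helper could look like, assuming
# flair's older token-level API (flair.data.Sentence/Token, Token.add_tag) and a BIO
# tagging scheme; the function name and the scheme are illustrative, not the project's
# actual implementation.
from typing import Dict, List

from flair.data import Sentence, Token


def build_flair_sentences_sketch(d: Dict) -> List[Sentence]:
    offset = 0  # token offset of the current sentence within the document
    sentences = []
    for sent_tokens, sent_spans in zip(d['sentences'], d['ner']):
        sentence = Sentence()
        for tok in sent_tokens:
            sentence.add_token(Token(tok))
        for token in sentence.tokens:
            token.add_tag('ner', 'O')
        # SciERC NER spans carry document-level token indices: [start, end, label]
        for start, end, label in sent_spans:
            for i in range(start, end + 1):
                prefix = 'B-' if i == start else 'I-'
                sentence.tokens[i - offset].add_tag('ner', prefix + label)
        offset += len(sent_tokens)
        sentences.append(sentence)
    return sentences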