import os

import arrow
import spacy

import utils  # local helpers for JSONL/gzip I/O (module path assumed)


def tokenize_dataset(root, spacy_model):
    nlp = spacy.load(spacy_model)
    for topic in sorted(os.listdir(root)):
        print('TOPIC:', topic)
        # Articles may be stored compressed or uncompressed.
        if os.path.exists(root / topic / 'articles.jsonl.gz'):
            articles = list(utils.read_jsonl_gz(root / topic / 'articles.jsonl.gz'))
        elif os.path.exists(root / topic / 'articles.jsonl'):
            articles = list(utils.read_jsonl(root / topic / 'articles.jsonl'))
        else:
            continue
        jsonl_out_path = root / topic / 'articles.tokenized.jsonl'
        out_batch = []
        for i, a in enumerate(articles):
            # Sentence-split and tokenize each article: one sentence per line,
            # tokens separated by single spaces.
            tokenized_doc = ''
            doc = nlp(a['text'])
            for sent in doc.sents:
                tokens = [tok.text for tok in sent if not tok.text.isspace()]
                tokenized_doc += ' '.join(tokens) + '\n'
            a['text'] = tokenized_doc.strip()
            out_batch.append(a)
            # Flush a batch to disk every 100 articles (override=False appends).
            if i % 100 == 0:
                utils.write_jsonl(out_batch, jsonl_out_path, override=False)
                out_batch = []
                print(i)
        # Write the remaining articles, then compress the output file.
        utils.write_jsonl(out_batch, jsonl_out_path, override=False)
        gz_out_path = root / topic / 'articles.tokenized.jsonl.gz'
        utils.gzip_file(jsonl_out_path, gz_out_path, delete_old=True)

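# A minimal usage sketch (assumption, not part of the original code): the `/`
# joins above imply that `root` is a pathlib.Path; 'datasets/t17' is a
# hypothetical dataset location and 'en_core_web_sm' a standard spaCy model.
if __name__ == '__main__':
    from pathlib import Path
    tokenize_dataset(Path('datasets/t17'), 'en_core_web_sm')
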
def times(self):
    # Publication times of all preprocessed articles, as naive datetimes
    # (timezone info is dropped so the times can be compared directly).
    articles = utils.read_jsonl(self.path / 'articles.preprocessed.jsonl')
    times = []
    for a in articles:
        t = arrow.get(a['time']).datetime
        t = t.replace(tzinfo=None)
        times.append(t)
    return times

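# Sketch of how the publication times might be used, e.g. to inspect a topic's
# date range (`dataset` is an illustrative name for an instance of the
# surrounding class, which is not shown in this excerpt):
#
#     times = dataset.times()
#     print('first article:', min(times))
#     print('last article:', max(times))
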
def _load_timelines(self):
    # Each line of timelines.jsonl holds one reference timeline as a list of
    # (time, summary) pairs; empty entries are skipped.
    timelines = []
    for raw_tl in utils.read_jsonl(self.path / 'timelines.jsonl'):
        if raw_tl:
            tl_items = []
            for t, s in raw_tl:
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines

def _load_timelines(self):
    timelines = []
    path = self.path / "timelines.jsonl"
    if not path.exists():
        return []
    for raw_tl in utils.read_jsonl(path):
        if raw_tl:
            tl_items = []
            for t, s in raw_tl:
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines

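# Based on the unpacking `for t, s in raw_tl` above, a single line of
# timelines.jsonl is expected to be a JSON list of [time, summary] pairs,
# roughly like the following (the summary format shown here, a list of
# sentences, is an assumption; only the pair structure is implied by the code):
#
#     [["2011-01-25", ["First summary sentence.", "Second sentence."]],
#      ["2011-01-28", ["Another day's summary."]]]
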
def articles(self):
    # Stream articles from the preprocessed file (plain or gzipped), yielding
    # only those inside the [start, end] window when those bounds are set.
    path1 = self.path / "articles.preprocessed.jsonl"
    path2 = self.path / "articles.preprocessed.jsonl.gz"
    if path1.exists():
        articles = utils.read_jsonl(path1)
    else:
        articles = utils.read_jsonl_gz(path2)
    for a_ in articles:
        a = load_article(a_)
        t = self.normalise_time(a.time)
        if self.start and t < self.start:
            continue
        if self.end and t > self.end:
            # Assumes the articles are stored in chronological order.
            break
        yield a

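# Usage sketch (assumption): restricting a dataset to a date window and
# streaming its articles. Whether `start` and `end` are set via the
# constructor or assigned directly is not shown in this excerpt; only the
# `.time` attribute of the yielded articles is implied by the code above.
#
#     dataset.start = datetime.datetime(2011, 1, 1)
#     dataset.end = datetime.datetime(2011, 12, 31)
#     for article in dataset.articles():
#         print(article.time)
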
def _load_timelines(self):
    # Variant that also appends debug output (the file path and each raw
    # timeline) to data_log.txt.
    timelines = []
    path = self.path / 'timelines.jsonl'
    with open("data_log.txt", "a") as ftmp:
        print(path, file=ftmp)
    if not path.exists():
        return []
    for raw_tl in utils.read_jsonl(path):
        if raw_tl:
            with open("data_log.txt", "a") as ftmp:
                print(raw_tl, file=ftmp)
            tl_items = []
            for t, s in raw_tl:
                t = self.normalise_time(arrow.get(t))
                tl_items.append((t, s))
            tl = Timeline(tl_items)
            timelines.append(tl)
    return timelines