import csv
import os
from collections import OrderedDict
from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple, Union

import tqdm
from trec_car import read_data
from trec_car.read_data import Paragraph, ParaLink, ParaText, iter_paragraphs


def partition_CAR(args):
    cbor_file = 'paragraphCorpus/dedup.articles-paragraphs.cbor'
    i = 0  # Index of the current output file; the first file is numbered 1
    num_bytes = 3e9  # Initialized above the threshold so the first file is created immediately
    size_threshold = 2e9  # Number of bytes that triggers rolling over to a new output .tsv file
    out_file = None
    for para in read_data.iter_paragraphs(
            open(args.ctx_files_dir + cbor_file, 'rb')):
        if num_bytes >= size_threshold:
            i += 1
            print(i)
            print(num_bytes)
            if i >= 2:
                out_file.close()  # Close the file that surpassed size_threshold
            out_file = open(args.ctx_files_dir + 'CAR_collection_{}.tsv'.format(i),
                            'wt', encoding='utf-8')  # Explicit encoding is required
            tsv_writer = csv.writer(out_file, delimiter='\t')
        # Write one row per paragraph: ["CAR_PID", "passage"]
        tsv_writer.writerow(
            ['CAR' + '_' + para.para_id, ' '.join(para.get_text().split())])
        num_bytes = os.path.getsize(args.ctx_files_dir +
                                    'CAR_collection_{}.tsv'.format(i))
    if out_file is not None:
        out_file.close()  # Close the last partition, which never reaches the threshold
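# A minimal usage sketch for partition_CAR, assuming an argparse-style entry
# point; the --ctx_files_dir default below is illustrative, not from the
# original code.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--ctx_files_dir', default='./',
                        help='Directory that contains paragraphCorpus/')
    partition_CAR(parser.parse_args())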
def iterate_paragraphs(
        self, paragraph_cbor_file,
        func: Callable[[Paragraph, List[Union[ParaLink, ParaText]]], Any],
        max_paras: Optional[int] = None) -> List[Tuple[str, Any]]:
    """
    :param paragraph_cbor_file: Location of the paragraphCorpus.cbor file
    :param func: Callback applied to each matching paragraph and its bodies
    :param max_paras: Optional cap on how many paragraphs to scan
    """
    processed_paragraphs = 0
    unique_paragraphs_seen = 0
    total = len(self.paragraphs_to_consider)
    result = []
    with open(paragraph_cbor_file, 'rb') as f:
        for p in iter_paragraphs(f):
            processed_paragraphs += 1
            if processed_paragraphs % 100000 == 0:
                print("(Searching paragraph cbor): {}".format(
                    processed_paragraphs))
            if max_paras and processed_paragraphs >= max_paras:
                break
            if p.para_id in self.paragraphs_to_consider:
                for para in self.paragraphs_to_consider[p.para_id]:
                    result.append((p.para_id, func(para, p.bodies)))
                unique_paragraphs_seen += 1
                if unique_paragraphs_seen == total:
                    break
    return result
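# Sketch of a callback for iterate_paragraphs: it ignores the caller-side
# paragraph object and collects the pages linked from the cbor bodies. The
# `indexer` object in the commented call is an assumption.
def collect_links(para, bodies):
    return [body.page for body in bodies if isinstance(body, ParaLink)]

# results = indexer.iterate_paragraphs(
#     'paragraphCorpus/dedup.articles-paragraphs.cbor', collect_links)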
def get_paragraphs(paragraphs_file):
    with open(paragraphs_file, 'rb') as f:
        for p in iter_paragraphs(f):
            texts = [
                elem.text if isinstance(elem, ParaText) else elem.anchor_text
                for elem in p.bodies
            ]
            yield p.para_id + '|__|' + (' '.join(texts))
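# Sketch: stream get_paragraphs into a flat text file, one "id|__|text" line
# per paragraph. The output path is an assumption for illustration.
def dump_paragraph_texts(paragraphs_file, out_path='paragraph_texts.txt'):
    with open(out_path, 'w', encoding='utf-8') as out:
        for line in get_paragraphs(paragraphs_file):
            out.write(line + '\n')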
def get_mapping(file_path: str, aspect_to_entity_dict: Dict[str, str]):
    with open(file_path, 'rb') as cbor:
        for para in tqdm.tqdm(read_data.iter_paragraphs(cbor), total=10000):
            for body in para.bodies:
                if isinstance(body, read_data.ParaLink) \
                        and body.link_section is not None:
                    # Map the section anchor of the link to the linked page id
                    aspect_to_entity_dict[body.link_section] = body.pageid
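# Sketch: build the aspect -> entity map and print a small sample; the corpus
# path passed here is an assumption about where the cbor file lives.
def preview_aspect_mapping(corpus_path: str, n: int = 5):
    aspect_to_entity: Dict[str, str] = {}
    get_mapping(corpus_path, aspect_to_entity)
    for aspect, entity in list(aspect_to_entity.items())[:n]:
        print('{} -> {}'.format(aspect, entity))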
def parse(self, f):
    for paragraph in iter_paragraphs(f):
        for body in paragraph.bodies:
            if isinstance(body, ParaLink):
                # print(body)
                if body.link_section is not None:
                    print("Link Section: {}".format(body.link_section))
                    print("Link Text: {}".format(body.get_text()))
                    print("Link Anchor: {}".format(body.anchor_text))
                    print("Link Page: {}".format(body.page))
def create_database(corpus: str, save: str):
    with open(corpus, 'rb') as cbor:
        # Map page ids to page names for every ParaLink in the corpus
        id_to_name_dict: Dict[str, str] = dict(
            (body.pageid, body.page)
            for para in tqdm.tqdm(read_data.iter_paragraphs(cbor))
            for body in para.bodies
            if isinstance(body, read_data.ParaLink))
    write_to_file(id_to_name_dict, save)
    print('File written to: {}'.format(save))
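# write_to_file is not defined in this snippet; a plausible stand-in, assumed
# here purely for illustration, pickles the mapping to disk.
def write_to_file(mapping: Dict[str, str], save: str):
    import pickle
    with open(save, 'wb') as out:
        pickle.dump(mapping, out)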
def quick_ids(self):
    counter = 0
    myset = set()
    with open(self.cbor_loc, 'rb') as f:
        for paragraph in iter_paragraphs(f):
            counter += 1
            if counter > 10:
                break
            myset.add(paragraph.para_id)
    return myset
def explore(self):
    progress = 0
    with open(self.cbor_loc, 'rb') as f:
        for paragraph in iter_paragraphs(f):
            entities = self.get_entities(paragraph)
            self.extract_from_text(paragraph, entities)
            progress += 1
            if progress % 10 == 0:  # Only explore the first 10 paragraphs
                break
def print_paragraphs(path=WikiParagrahs.file_path_list[0], limit=1):
    print('*** reading {} paragraph(s) from file: {} ***'.format(limit, path))
    with open(path, 'rb') as f:
        counter = 1
        for p in iter_paragraphs(f):
            print()
            print('*** PRINTING PARAGRAPH {} ***'.format(counter))
            print('----------------------- PARAGRAPH ID -----------------------')
            print(p.para_id)
            print('----------------------- RAW PARAGRAPH -----------------------')
            print(p)
            # Print just the text
            texts = [
                elem.text if isinstance(elem, ParaText) else elem.anchor_text
                for elem in p.bodies
            ]
            print('----------------------- TEXT -----------------------------')
            print(' '.join(texts))
            print('----------------------- ENTITIES -----------------------------')
            entities = [
                elem.page for elem in p.bodies if isinstance(elem, ParaLink)
            ]
            print(entities)
            print('----------------------- MIXED -----------------------------')
            mixed = [(elem.anchor_text, elem.page) if isinstance(elem, ParaLink)
                     else (elem.text, None) for elem in p.bodies]
            print(mixed)
            if counter >= limit:
                break
            counter += 1
def process_paragraphs(self):
    if not self.freq_dict:
        para_dict = {}
        raw_data = {}
        para_text = {}
        with open(self.paragraph_file, 'rb') as f:
            for p in iter_paragraphs(f):
                # entities = [elem.page
                #             for elem in p.bodies
                #             if isinstance(elem, ParaLink)]  # retrieving entities from a paragraph; P@5 gets a bit higher
                # para_dict[p.para_id] = self.preprocess_text(p.get_text(), ret="freq")
                raw_data[p.para_id] = self.preprocess_text(p.get_text(), ret="raw")
                para_text[p.para_id] = p.get_text()
        self.freq_dict = para_dict  # Stays empty while the "freq" branch above is commented out
        self.raw_data = raw_data
        self.para_text = para_text
def retrieve_paragraph_mappings(self, cbor_loc):
    """
    :param cbor_loc: Location of the paragraphCorpus.cbor file
    """
    counter = 0
    seen = 0
    total = len(self.paragraphs_to_retrieve)
    with open(cbor_loc, 'rb') as f:
        for p in iter_paragraphs(f):
            counter += 1
            if counter % 100000 == 0:
                print("(Searching paragraph cbor): {}".format(counter))
            if p.para_id in self.paragraphs_to_retrieve:
                for p_to_be_updated in self.paragraphs_to_retrieve[p.para_id]:
                    self.update_paragraph(p_to_be_updated, p.bodies)
                seen += 1
                if seen == total:
                    break
def retrieve_text_matching_ids(self, ids: Set[str]):
    jsons = OrderedDict()  # type: OrderedDict[str, str]
    out = open(self.json_dump_name + ".jsonl", 'w')
    counter = 0
    with open(self.cbor_loc, 'rb') as f:
        for paragraph in iter_paragraphs(f):
            counter += 1
            if paragraph.para_id in ids:
                jsons[paragraph.para_id] = self.create_json(paragraph)
                # stop once we've retrieved all of the paragraphs
                ids.remove(paragraph.para_id)
                if not ids:
                    break
    for _, json in jsons.items():
        out.write(json + "\n")
    out.close()
def build_d(read_path=WikiParagrahs.file_path_list[0],
            write_path=write_d_path, paragraph_limit=1):
    print('*** reading {} paragraph(s) from file: {} ***'.format(
        paragraph_limit, read_path))
    with open(read_path, 'rb') as f_read:
        with open(write_path, 'w') as f_write:
            counter = 1
            for p in iter_paragraphs(f_read):
                if counter % 10000 == 0:
                    print('{} / {} paragraphs processed'.format(
                        counter, paragraph_limit))
                f_write.write(p.para_id + '\t' + p.get_text() + '\n')
                if counter >= paragraph_limit:
                    break
                counter += 1
def dump_cbor(self):
    out = open(self.json_dump_name + ".jsonl", 'w')
    # index_dir = self.json_dump_name + "_index"
    pmap_out = open(self.json_dump_name + "_pmap.txt", 'w')
    # os.mkdir(index_dir)
    progress = 0
    offset = 0
    with open(self.cbor_loc, 'rb') as f:
        for paragraph in iter_paragraphs(f):
            to_json = self.create_json(paragraph) + "\n"
            out.write(to_json)
            # Record the inclusive byte range of this paragraph's JSON line;
            # measure in encoded bytes so offsets stay valid for non-ASCII text
            json_bytes = len(to_json.encode('utf-8'))
            pmap_out.write("{} {} {}\n".format(paragraph.para_id, offset,
                                               offset + json_bytes - 1))
            offset += json_bytes
            progress += 1
            if progress % 10000 == 0:
                print(progress)
    out.close()
    pmap_out.close()
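# Sketch: use the byte ranges recorded in the _pmap.txt file to seek straight
# to one paragraph's JSON line without scanning the whole .jsonl dump. The
# file names mirror dump_cbor's outputs; the helper itself is an assumption.
def lookup_json(json_dump_name: str, para_id: str) -> Optional[str]:
    with open(json_dump_name + '_pmap.txt') as pmap:
        for line in pmap:
            pid, start, end = line.split()
            if pid == para_id:
                with open(json_dump_name + '.jsonl', 'rb') as dump:
                    dump.seek(int(start))
                    return dump.read(int(end) - int(start) + 1).decode('utf-8')
    return None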
def create_paragraph_map(self, paragraph_path):
    # Use a context manager so the cbor file handle is closed when done
    with open(paragraph_path, 'rb') as f:
        for para in read_data.iter_paragraphs(f):
            self.paragraph_map[para.para_id] = para.get_text()
def create_para_id_list(paragraph_cbor_file: str) -> List[str]:
    with open(paragraph_cbor_file, 'rb') as f:
        return [p.para_id for p in iter_paragraphs(f)]
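# create_para_id_list materializes every id in memory, which is tens of
# millions of strings for the full paragraphCorpus. A streaming variant,
# sketched here as an alternative, yields ids one at a time.
def iter_para_ids(paragraph_cbor_file: str) -> Iterator[str]:
    with open(paragraph_cbor_file, 'rb') as f:
        for p in iter_paragraphs(f):
            yield p.para_id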