def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):
    tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

    if self.n_random_papers:
        print("Random newspaper subset...")
        scan_names = []
        for file_path in tqdm(self.file_paths):
            with open(file_path, 'rb') as f:
                items = ijson.kvitems(f, '')
                for k, v in items:
                    scan_names.append(k)
        papers = list(set([self.get_paper_name(scan) for scan in scan_names]))
        papers.sort()
        print(f"{len(papers)} total papers...")
        random.seed(789)
        random_papers = random.sample(papers, self.n_random_papers)
        print(f"Selected random papers: {random_papers}")

    print("Creating bi-encoder dict...")
    for file_path in tqdm(self.file_paths):
        with open(file_path, 'rb') as f:
            items = ijson.kvitems(f, '')
            ocr_text_generators = []
            for k, v in items:
                if self.month_str:
                    if self.month_str in k:
                        if self.n_random_papers:
                            if self.get_paper_name(k) in random_papers:
                                ocr_text_generators.append(self.ocr_text_iter(v))
                        else:
                            ocr_text_generators.append(self.ocr_text_iter(v))
                else:
                    if self.n_random_papers:
                        if self.get_paper_name(k) in random_papers:
                            ocr_text_generators.append(self.ocr_text_iter(v))
                    else:
                        ocr_text_generators.append(self.ocr_text_iter(v))

            if len(ocr_text_generators) == 0:
                continue

            for gen in ocr_text_generators:
                for layobj in gen:
                    title, passage, object_id = layobj
                    uid = object_id
                    if self.normalize:
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                    ctxs[uid] = BiEncoderPassage(passage, title)
def create_database(self) -> DB:
    self._pre_run()
    with open(
        self._all_cards_path, 'r', encoding='UTF-8'
    ) as all_cards_file, open(
        self._all_sets_path, 'r', encoding='UTF-8'
    ) as all_sets_file:
        handler = logging.FileHandler(self._logging_path, mode='w')
        parse_logger.addHandler(handler)
        try:
            raw_cards = ijson.kvitems(all_cards_file, 'data')
            cards = self.create_card_table(raw_cards)

            # kvitems is a single-pass iterator, so rewind the file and build a
            # fresh iterator before parsing the cardboard table.
            all_cards_file.seek(0)
            raw_cards = ijson.kvitems(all_cards_file, 'data')
            cardboards = self.create_cardboard_table(raw_cards, cards)

            artists = self.create_table_for_model(self._model_parser_map[i.Artist])
            blocks = self.create_table_for_model(self._model_parser_map[i.Block])
            printings = self.create_table_for_model(self._model_parser_map[i.Printing])

            raw_expansions = ijson.kvitems(all_sets_file, 'data')
            expansions = self.create_expansion_table(
                raw_expansions=raw_expansions,
                cardboards=cardboards,
                printings=printings,
                artists=artists,
                blocks=blocks,
            )

            return self._create_database_from_tables(
                {
                    'cards': cards,
                    'cardboards': cardboards,
                    'printings': printings,
                    'artists': artists,
                    'blocks': blocks,
                    'expansions': expansions,
                }
            )
        finally:
            parse_logger.removeHandler(handler)
def load_data_to(self, ctxs: Dict[object, BiEncoderPassage], date):
    year = "_" + str(datetime.strptime(date, "%b-%d-%Y").year) + "_"
    tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

    print(f"Creating bi-encoder dict for {date}...")
    for file_path in tqdm(self.file_paths):
        if year in file_path:
            with open(file_path, 'rb') as f:
                items = ijson.kvitems(f, '')
                ocr_text_generators = []
                for k, v in items:
                    if date in k:
                        ocr_text_generators.append(self.ocr_text_iter(v))

                if len(ocr_text_generators) == 0:
                    continue

                for gen in ocr_text_generators:
                    for layobj in gen:
                        title, passage, object_id = layobj
                        uid = object_id
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                        ctxs[uid] = BiEncoderPassage(passage, title)
def main(filename):
    with open(filename, 'r') as file:
        objects = ijson.kvitems(file, 'wellFormedAnswers')
        valid_old_key_to_new_key = {}
        new_key = 0
        for key, well_formed_answer in objects:
            value = (well_formed_answer if isinstance(well_formed_answer, list)
                     else literal_eval(well_formed_answer))
            if len(value) > 0:
                valid_old_key_to_new_key[key] = str(new_key)
                new_key += 1

    filtered_data = {}
    fieldnames = ['query', 'query_type', 'answers', 'wellFormedAnswers', 'passages']
    for fieldname in fieldnames:
        add_data(filename, filtered_data, fieldname, valid_old_key_to_new_key)

    with open(filename, 'w') as fw:
        json.dump(filtered_data, fw)
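The filter above relies on a key property of ijson.kvitems: given a prefix such as 'wellFormedAnswers', it streams one (key, value) pair per entry under that top-level object without loading the rest of the document. A minimal sketch of that behaviour, assuming ijson 3.x (which accepts text-mode streams, as the code above does) and a made-up column-oriented document whose field names merely mirror the code:

import io
import json

import ijson

# Hypothetical sample in the same column-oriented shape the filter expects.
sample = json.dumps({
    "query": {"0": "first query", "1": "second query"},
    "wellFormedAnswers": {"0": ["an answer"], "1": "[]"},
})

# One (key, value) pair per entry under the top-level "wellFormedAnswers" object.
for key, answer in ijson.kvitems(io.StringIO(sample), 'wellFormedAnswers'):
    print(key, answer)   # prints: 0 ['an answer'], then 1 []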
def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):
    for file_path in self.file_paths:
        with open(file_path, 'rb') as f:
            items = ijson.kvitems(f, '')
            ocr_text_generators = [
                ((ik['image_file_name'], ik['ocr_text'], ik['object_id'])
                 for ik in v if ik['label'] == 'article')
                for k, v in items
            ]
            for gen in ocr_text_generators:
                for layobj in gen:
                    title, passage, object_id = layobj
                    uid = str(object_id) + '_' + title
                    if self.normalize:
                        passage = normalize_passage(passage)
                    ctxs[uid] = BiEncoderPassage(passage[:self.passage_char_max], title)
def update_prices(self, start_of_week: datetime.date):
    logger.info("Querying DB for most recent prices")
    with connection.cursor() as cursor:
        cursor.execute("""
            SELECT card_printing.id, face_printing.uuid, latest_price.date
            FROM cards_cardprinting card_printing
            JOIN cards_cardfaceprinting face_printing
                ON face_printing.card_printing_id = card_printing.id
            LEFT JOIN cards_cardprice latest_price
                ON latest_price.id = card_printing.latest_price_id
        """)
        recent_price_map = {
            uuid: (printing_id, most_recent_date)
            for printing_id, uuid, most_recent_date in cursor.fetchall()
        }

    logger.info("Updating prices")
    # We need to track which printings we've already handled: a printing with two faces
    # yields two price rows for the same printing, and we don't want to apply the prices twice.
    updated_printings = set()
    with open(_paths.PRICES_JSON_PATH, "r", encoding="utf8") as prices_file:
        cards = ijson.kvitems(prices_file, "data")
        for uuid, price_data in cards:
            if uuid not in recent_price_map:
                logger.warning("No printing found for %s", uuid)
                continue

            printing_id, latest_price = recent_price_map[uuid]
            if printing_id in updated_printings:
                logger.info("Already updated %s. Skipping...", uuid)
                continue

            logger.info("Updating prices for %s", uuid)
            apply_printing_prices(start_of_week, price_data, printing_id, latest_price)
            updated_printings.add(printing_id)
def _fetch_history(self, send_data_conn, request_queue, history_file_path):
    """Prepare one batch ahead: when a request is received, immediately return
    the previously prepared batch and prepare the next one.
    """
    return_batch = {}
    while True:
        historyRange = request_queue.get()
        if type(historyRange) is Traffic_history_service.QueueDone:
            break

        assert isinstance(historyRange, RequestHistoryRange)
        send_data_conn.send(return_batch)

        return_batch = {}
        with open(history_file_path, "rb") as f:
            for index, (t, vehicles_state) in enumerate(
                ijson.kvitems(f, "", use_float=True)
            ):
                if (historyRange.start_index <= index
                        and index < historyRange.start_index + historyRange.batch_count):
                    return_batch[t] = vehicles_state

    send_data_conn.close()
def fetch_agent_missions(history_file_path: str, scenario_root_path: str, mapLocationOffset):
    assert os.path.isdir(scenario_root_path)
    history_mission_filepath = os.path.join(scenario_root_path, "history_mission.pkl")

    if not os.path.exists(history_mission_filepath):
        history_mission = {}
    else:
        with open(history_mission_filepath, "rb") as f:
            history_mission = pickle.load(f)

    if history_file_path in history_mission:
        return history_mission[history_file_path]

    vehicle_missions = {}
    with open(history_file_path, "rb") as f:
        for t, vehicles_state in ijson.kvitems(f, "", use_float=True):
            for vehicle_id in vehicles_state:
                if vehicle_id in vehicle_missions:
                    continue
                vehicle_missions[vehicle_id] = scenario.Mission(
                    start=scenario.Start(
                        Traffic_history_service.apply_map_location_offset(
                            vehicles_state[vehicle_id]["position"],
                            mapLocationOffset,
                        ),
                        scenario.Heading(vehicles_state[vehicle_id]["heading"]),
                    ),
                    goal=scenario.EndlessGoal(),
                    start_time=float(t),
                )
    history_mission[history_file_path] = vehicle_missions

    # update cached history_mission_file
    with open(history_mission_filepath, "wb") as f:
        pickle.dump(history_mission, f)

    return vehicle_missions
def __init__(self, history_file_path):
    self._history_file_path = history_file_path
    self._all_timesteps = set()
    self._current_traffic_history = {}
    self._prev_batch_history = {}
    # return if traffic history is not used
    if history_file_path is None:
        return

    self._log = logging.getLogger(self.__class__.__name__)
    send_data_conn, receive_data_conn = Pipe()
    self._receive_data_conn = receive_data_conn
    self._request_queue = Queue()
    self._fetch_history_proc = Process(
        target=self._fetch_history,
        args=(
            send_data_conn,
            self._request_queue,
            self._history_file_path,
        ),
    )
    self._fetch_history_proc.daemon = True
    self._fetch_history_proc.start()

    self._range_start = 0
    self._batch_size = 300
    # initialize
    with open(self._history_file_path, "rb") as f:
        for index, (t, vehicles_state) in enumerate(
            ijson.kvitems(f, "", use_float=True)
        ):
            self._all_timesteps.add(t)
            if (self._range_start <= index
                    and index < self._range_start + self._batch_size):
                self._current_traffic_history[t] = vehicles_state
    self._range_start += self._batch_size

    # prepares the next batch
    self._prepare_next_batch()
    self._receive_data_conn.recv()
def rows(self):
    with open(self._dataset_spec["input_path"], "rb") as inf:
        for t, states in ijson.kvitems(inf, "", use_float=True):
            for state in states.values():
                yield (t, state)
def print_tagged_data():
    isentences = iter_sentences()
    tagdata = {}
    count = 0
    with open(args.tags[0], "r", encoding="utf-8") as infile:
        seen = set()
        items = ijson.kvitems(infile, "item")
        for k, v in items:
            if k != "sentences":
                continue

            sid, english, spanish, credits, english_score, spanish_score = next(isentences)

            count += 1
            if not count % 1000 and _INTERACTIVE:
                print(count, end="\r", file=sys.stderr)

            all_tags = []
            first = True
            for s in v:
                for t in s["tokens"]:
                    if first:
                        offset = int(t["begin"])
                        first = False

                    form = get_original_form(t, spanish, offset)
                    pos_tags = []
                    for word in sorted(set([form, t["form"]])):
                        pos_tags += tag_to_pos(t, word)
                    if not pos_tags:
                        continue
                    pos_tags = sorted(list(set(pos_tags)))
                    all_tags += pos_tags

                    for pos_tag in pos_tags:
                        pword, junk, plemma = pos_tag[1].partition("|")
                        if not plemma:
                            plemma = pword
                        if "_" in plemma:
                            for word, lemma in zip(pword.split("_"), plemma.split("_")):
                                if word != lemma:
                                    all_tags.append(["split", f"{word}|{lemma}"])
                                else:
                                    all_tags.append(["split", f"{word}"])

            grouped_tags = group_tags(all_tags)

            # ignore sentences with the same adj/adv/noun/verb lemma combination
            unique_tags = set()
            for pos, tags in grouped_tags.items():
                if pos not in ["adj", "adv", "n", "v", "part-adj", "part-verb"]:
                    continue
                for t in tags:
                    word, junk, lemma = t.partition("|")
                    if not lemma:
                        lemma = word
                    unique_tags.add(lemma)

            uniqueid = hash(":".join(sorted(unique_tags)))
            if uniqueid in seen:
                continue
            seen.add(uniqueid)

            interj = get_interjections(spanish)
            if interj:
                grouped_tags["interj"] = list(map(str.lower, interj))

            tag_str = " ".join(
                [f":{tag}," + ",".join(items) for tag, items in grouped_tags.items()]
            )

            print(f"{english}\t{spanish}\t{credits}\t{english_score}\t{spanish_score}\t{tag_str}")
def add_data(filename, filtered_data, fieldname, valid_old_key_to_new_key):
    with open(filename, 'r') as f:
        objects = ijson.kvitems(f, fieldname)
        filtered_data[fieldname] = {
            valid_old_key_to_new_key[key]: query
            for key, query in objects
            if key in valid_old_key_to_new_key
        }
import argparse
import json
import os

import ijson
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('-c', '--captions-path', type=str, required=True,
                    help='path to unfiltered captions')
parser.add_argument('-s', '--save-path', type=str, required=True,
                    help='path to save filtered captions')
args = parser.parse_args()

captions_path = args.captions_path
save_path = args.save_path

# Only keep captions for videos whose features have already been extracted.
ids = os.listdir('saved_features')

filtered_captions = {}
with open(captions_path, 'r') as input_file:
    captions_json = ijson.kvitems(input_file, '')
    for vid_id, captions in tqdm(captions_json):
        if vid_id in ids:
            filtered_captions[vid_id] = captions

json.dump(filtered_captions, open(save_path, 'w'))
def read_json_by_item(f: io.StringIO) -> Iterator[Tuple[Any, Any]]:
    yield from ijson.kvitems(f, '')
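Since read_json_by_item simply forwards to ijson.kvitems with an empty prefix, it lazily yields the top-level key/value pairs of a JSON object. A quick usage sketch, assuming ijson 3.x (which accepts text streams) and a made-up document:

import io
import json

# Hypothetical input document.
doc = json.dumps({"a": 1, "b": [2, 3]})

# Each iteration produces one fully built top-level (key, value) pair.
for key, value in read_json_by_item(io.StringIO(doc)):
    print(key, value)   # prints: a 1, then b [2, 3]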