def fetch_max_t_of_prev_trading_day(session: Session, ric: str, t: datetime) -> int:
    """Return the Unix epoch (seconds) of the latest Price.t for ``ric``
    recorded on or before the previous calendar day in JST.

    ``t`` must be a timezone-aware UTC datetime (enforced by assertion).
    """
    assert t.tzinfo == UTC
    # Shift into JST (UTC+9), then step back one calendar day.
    jst_yesterday = (t + timedelta(hours=9) - timedelta(days=1)).date()
    query = session.query(extract('epoch', func.max(Price.t)))
    query = query.filter(cast(in_jst(Price.t), Date) <= jst_yesterday,
                         Price.ric == ric)
    return query.scalar()
def fetch_prices_of_a_day(session: Session, ric: str, jst: datetime) -> List[Tuple[datetime, Decimal]]:
    """Fetch every (timestamp, value) price point of ``ric`` on the JST
    calendar day of ``jst``, ordered by time.

    Timestamps are rendered to text in UTC by the database and parsed back
    into timezone-aware UTC datetimes.
    """
    utc_text = func.to_char(in_utc(Price.t), 'YYYY-MM-DD HH24:MI:SS').label('t')
    rows = (session
            .query(utc_text, Price.val)
            .filter(cast(in_jst(Price.t), Date) == jst.date(),
                    Price.ric == ric)
            .order_by(Price.t)
            .all())
    prices = []
    for row in rows:
        when = datetime.strptime(row.t, '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC)
        prices.append((when, row.val))
    return prices
def _with_tz_suffix(raw_t: str) -> str:
    """Append a UTC offset to a raw timestamp that carries no tz marker.

    BUG FIX: the original condition was ``'Z' not in t or '+' not in t``,
    which appended '+0000' even when a 'Z' designator was already present
    (producing e.g. '...Z+0000'); the suffix must be added only when
    NEITHER marker is present.
    """
    if 'Z' not in raw_t and '+' not in raw_t:
        return raw_t + '+0000'
    return raw_t


def _split_or_none(field: str):
    """Split a colon-delimited CSV field into a list; empty field -> None."""
    return None if field == '' else field.split(':')


def insert_headlines(session: Session, dir_nikkei_headline: Path,
                     train_span: Span, valid_span: Span, test_span: Span,
                     logger: Logger) -> None:
    """Bulk-load Nikkei headline CSVs (plain or gzipped) into ``Headline``.

    For each file, the year of the first data row is checked against the
    table; if that year is already present the function returns immediately.
    NOTE(review): the early ``return`` aborts the whole run rather than
    skipping just the one file — preserved from the original, confirm intent.

    Rows with unparseable timestamps are logged and skipped. Each row is
    assigned a train/valid/test phase by its timestamp, or None when it
    falls outside every span.
    """
    dests = list(dir_nikkei_headline.glob('*.csv.gz')) + \
        list(dir_nikkei_headline.glob('*.csv'))
    for dest in dests:
        with gzip.open(str(dest), mode='rt') if dest.suffix == '.gz' \
                else dest.open(mode='r') as f:
            # Number of data rows (header excluded); drives the tqdm loop.
            N = sum(1 for _ in f) - 1
            f.seek(0)
            reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
            next(reader)  # skip header
            fields = next(reader)
            t = datetime.strptime(_with_tz_suffix(fields[1]),
                                  NIKKEI_DATETIME_FORMAT).astimezone(JST)
            first = session \
                .query(Headline) \
                .filter(extract('year', in_jst(Headline.t)) == t.year) \
                .first()
            if first is not None:
                return
            logger.info('start {}'.format(f.name))
            f.seek(0)
            next(reader)  # skip header again after rewind
            headlines = []
            for _ in tqdm(range(N)):
                fields = next(reader)
                raw_t = _with_tz_suffix(fields[1])
                article_id = fields[5]
                headline = fields[6]
                try:
                    t = datetime.strptime(raw_t, NIKKEI_DATETIME_FORMAT)
                except ValueError:
                    message = 'ValueError: {}, {}, {}'
                    logger.info(message.format(raw_t, article_id, headline))
                    continue
                if train_span.start <= t < train_span.end:
                    phase = Phase.Train.value
                elif valid_span.start <= t < valid_span.end:
                    phase = Phase.Valid.value
                elif test_span.start <= t < test_span.end:
                    phase = Phase.Test.value
                else:
                    phase = None
                headlines.append({
                    'article_id': article_id,
                    't': t,
                    'headline': headline,
                    'isins': _split_or_none(fields[25]),
                    'countries': _split_or_none(fields[36]),
                    'categories': _split_or_none(fields[37]),
                    'keywords_headline': _split_or_none(fields[-2]),
                    'keywords_article': _split_or_none(fields[-1]),
                    'is_used': None,
                    'phase': phase
                })
        # Bulk-insert this file's rows in one statement, then commit.
        # NOTE(review): the collapsed original leaves the indentation of
        # these two statements ambiguous; per-file insert is assumed.
        session.execute(Headline.__table__.insert(), headlines)
        session.commit()
def load_alignments_from_db(session: Session, phase: Phase, logger: Logger) -> List[Alignment]:
    """Build headline-to-price-sequence alignments for one dataset phase.

    For every used headline of ``phase`` (ordered by time), fetch the latest
    price values for every (ric, seqtype) pair, then replace each
    ``<yen val="..."/>`` token in the headline with the operation derived
    from the previous close and the latest value (or a ``"z"`` placeholder
    when no prices are available).

    Returns a list of ``Alignment`` dicts.
    """
    headlines = session \
        .query(Headline.article_id,
               Headline.tag_tokens,
               Headline.t,
               cast(extract('epoch', Headline.t), Integer).label('unixtime'),
               cast(extract('hour', in_jst(Headline.t)), Integer).label('jst_hour')) \
        .filter(Headline.is_used.is_(True), Headline.phase == phase.value) \
        .order_by(Headline.t) \
        .all()
    headlines = list(headlines)
    rics = fetch_rics(session)
    alignments = []
    seqtypes = [
        SeqType.RawShort, SeqType.RawLong,
        SeqType.MovRefShort, SeqType.MovRefLong,
        SeqType.NormMovRefShort, SeqType.NormMovRefLong,
        SeqType.StdShort, SeqType.StdLong
    ]
    logger.info(
        'start creating alignments between headlines and price sequences.')
    for h in tqdm(headlines):
        # Find the latest prices before the article is published.
        # (Idiom fix: dict over a generator instead of materializing a list.)
        chart = dict(fetch_latest_vals(session, h.t, ric, seqtype)
                     for (ric, seqtype) in itertools.product(rics, seqtypes))
        short_term_vals = chart[stringify_ric_seqtype(Code.N225.value,
                                                      SeqType.RawShort)]
        long_term_vals = chart[stringify_ric_seqtype(Code.N225.value,
                                                     SeqType.RawLong)]
        # Replace tags with price tags.
        # (Idiom fix: iterate tokens directly instead of range(len(...)).)
        processed_tokens = []
        for token in h.tag_tokens:
            if token.startswith('<yen val="') and token.endswith('"/>'):
                ref = fromstring(token).attrib['val']
                if len(short_term_vals) > 0 and len(long_term_vals) > 0:
                    prev_trading_day_close = Decimal(long_term_vals[0])
                    latest = Decimal(short_term_vals[0])
                    processed_tokens.append(
                        find_operation(ref, prev_trading_day_close, latest))
                else:
                    # No price data available: emit a neutral placeholder.
                    processed_tokens.append('<yen val="z"/>')
            else:
                processed_tokens.append(token)
        alignment = Alignment(h.article_id, str(h.t), h.jst_hour,
                              processed_tokens, chart)
        alignments.append(alignment.to_dict())
    logger.info(
        'end creating alignments between headlines and price sequences.')
    return alignments
def _set_correctness(article_id: str, method_name: str, nth, form) -> None:
    """Persist the submitted correctness score for one generation method.

    Replaces three copy-pasted query blocks (Base/Gold/Ours) from the
    original implementation.
    """
    r = db \
        .session \
        .query(GenerationResult) \
        .filter(GenerationResult.article_id == article_id,
                GenerationResult.method_name == method_name) \
        .one()
    r.correctness = form.get('correctness-{}'.format(nth[method_name]))


def article_evaluation(article_id: str, method: str, is_debug: bool) -> flask.Response:
    """Handle one human-evaluation page.

    POST: store the submitted note/fluency/informativeness on the
    ``HumanEvaluation`` row and the per-method correctness scores, then
    redirect back to the referrer.
    GET: render the evaluation form with the headline, its price tables,
    and the (shuffled) generation results.
    """
    if method == 'POST':
        h = db \
            .session \
            .query(HumanEvaluation) \
            .filter(HumanEvaluation.article_id == article_id) \
            .one()
        form = flask.request.form
        # Map method name -> 1-based display position used in form field names.
        nth = dict([(method_name, i + 1)
                    for (i, method_name) in enumerate(h.ordering)])

        def _clean(v):
            # Empty or whitespace-only submissions are stored as NULL.
            return None if v is None or v.strip() == '' else v

        h.note = _clean(form.get('note'))
        h.fluency = _clean(form.get('fluency'))
        h.informativeness = _clean(form.get('informativeness'))
        for method_name in ['Base', 'Gold', 'Ours']:
            _set_correctness(article_id, method_name, nth, form)
        db.session.commit()
        referrer = flask.request.form.get('referrer', '/')
        return flask.redirect(referrer)
    else:
        headline = db \
            .session \
            .query(Headline.article_id,
                   Headline.simple_headline.label('gold_result'),
                   Headline.t,
                   func.to_char(in_jst(Headline.t),
                                'YYYY-MM-DD HH24:MI:SS').label('s_jst')) \
            .filter(Headline.article_id == article_id) \
            .one()
        ric_tables = create_ric_tables(db.session, config.rics,
                                       ric_to_ric_info, headline.t)
        group_size = 3
        # Pad with dummies so the tables divide evenly into rows of
        # group_size (the original compared against a literal 3; the
        # constant is used for consistency).
        while len(ric_tables) % group_size != 0:
            ric_tables.append(Table('', '', '', [], is_dummy=True))
        ric_table_groups = [
            ric_tables[i:i + group_size]
            for i in range(0, len(ric_tables), group_size)
        ]
        # It is better to share one procedure with the search,
        # but we keep this procedure for convenience
        target = db \
            .session \
            .query(HumanEvaluation) \
            .filter(HumanEvaluation.article_id == article_id) \
            .one_or_none()
        # NOTE(review): ``target`` can be None (one_or_none) yet is
        # dereferenced below; confirm every article has a HumanEvaluation
        # row, or this will raise AttributeError.
        method_names = ['Gold'] if target.ordering is None else target.ordering
        if is_debug:
            method_names = order_method_names_for_debug(method_names)
        d = dict()
        m = []
        for method_name in method_names:
            res = db \
                .session \
                .query(GenerationResult.article_id,
                       GenerationResult.result,
                       GenerationResult.correctness) \
                .filter(GenerationResult.article_id == article_id,
                        GenerationResult.method_name == method_name) \
                .one_or_none()
            if res is not None:
                # Gold always shows the human-written headline, not the
                # stored generation result.
                text = headline.gold_result \
                    if method_name == 'Gold' \
                    else res.result
                d[method_name] = EvalTarget(method_name, text, is_debug)
                m.append(method_name)
        note = '' if target.note is None else target.note
        fluency = '' if target.fluency is None else target.fluency
        informativeness = '' \
            if target.informativeness is None \
            else target.informativeness
        targets = [(i + 1, d[method_name])
                   for (i, method_name) in enumerate(m)]
        return flask.render_template(
            'human_evaluation.pug',
            title='debug' if is_debug else 'human-evaluation',
            article_id=headline.article_id,
            timestamp=headline.s_jst + ' JST',
            targets=targets,
            fluency=fluency,
            informativeness=informativeness,
            note=note,
            ric_table_groups=ric_table_groups)
def list_targets_of_human_evaluation(is_debug: bool) -> flask.Response:
    """Render the paginated list of headlines targeted for human evaluation.

    Query-string parameters: ``page`` plus up to five
    (``fieldN``, ``relN``, ``valN``) filter triplets that are compiled into
    SQL constraints by ``construct_constraint_query``.
    """
    args = flask.request.args
    page = int(args.get('page', default=1))
    conditions = []
    for i in range(5):
        field = args.get('field' + str(i))
        relation = args.get('rel' + str(i))
        val = args.get('val' + str(i))
        if field is not None and relation is not None and val is not None:
            constraint = construct_constraint_query(field.strip(),
                                                    relation.strip(),
                                                    val.strip())
            conditions.append(constraint)
    q = db \
        .session \
        .query(HumanEvaluation.article_id,
               HumanEvaluation.ordering,
               HumanEvaluation.is_target,
               (func.coalesce(HumanEvaluation.fluency, '')).label('fluency'),
               (func.coalesce(HumanEvaluation.informativeness, '')).label('informativeness'),
               (func.coalesce(HumanEvaluation.note, '')).label('note'),
               Headline.simple_headline.label('gold_result'),
               Headline.phase,
               func.to_char(in_jst(Headline.t), 'YYYY-MM-DD HH24:MI').label('jst')) \
        .outerjoin(Headline, HumanEvaluation.article_id == Headline.article_id) \
        .filter(Headline.is_used.is_(True), *conditions) \
        .order_by(Headline.t)
    n_results = q.count()
    per_page = config.n_items_per_page
    articles = []
    for h in q.limit(per_page).offset((page - 1) * per_page).all():
        method_names = ['Gold'] if h.ordering is None else h.ordering
        if is_debug:
            method_names = order_method_names_for_debug(method_names)
        eval_targets = []
        for method_name in method_names:
            res = db \
                .session \
                .query(GenerationResult.article_id,
                       GenerationResult.result,
                       GenerationResult.correctness) \
                .filter(GenerationResult.article_id == h.article_id,
                        GenerationResult.method_name == method_name) \
                .one_or_none()
            if res is None:
                et = EvalTarget(method_name, h.gold_result, None) \
                    if method_name == 'Gold' \
                    else EvalTarget(method_name, '', None)
            else:
                text = h.gold_result \
                    if method_name == 'Gold' \
                    else res.result
                et = EvalTarget(method_name, text, is_debug)
            eval_targets.append(et)
        # NOTE(review): this chained comparison evaluates as
        # (n_methods == len(h.fluency)) and len(h.fluency) > 0 and
        # len(h.informativeness) > 0 — comparing a method COUNT against a
        # string LENGTH looks unintended; preserved as-is, confirm the
        # intended completion criterion.
        is_finished = \
            len(list(config.result.keys()) + ['Gold']) == \
            len(h.fluency) > 0 \
            and len(h.informativeness) > 0
        e = EvalListRow(h.article_id, h.jst, h.phase, eval_targets,
                        h.fluency, h.informativeness, h.note, h.is_target,
                        is_finished)
        articles.append(e)
    # BUG FIX: the original used ``page < (n_results // per_page)``, which is
    # off by one whenever n_results is not a multiple of per_page: the final,
    # partially filled page was unreachable via has_next and the display
    # range on the penultimate page was wrong.
    has_next = page * per_page < n_results
    if n_results == 0:
        display_msg = 'No headline is found'
    else:
        offset = (page - 1) * per_page + 1
        end = offset + per_page - 1 if has_next else n_results
        display_msg = 'Displaying {:,} to {:,} of {:,}'.format(
            offset, end, n_results)
    pagination = DummyPagination(has_prev=page > 1,
                                 has_next=has_next,
                                 display_msg=display_msg)
    return flask.render_template(
        'list_human_evaluation.pug',
        title='debug' if is_debug else 'human-evaluation',
        condition=conditions,
        articles=articles,
        pagination=pagination)
def fetch_close(session: Session, ric: str, jst: datetime) -> float:
    """Return the closing price of ``ric`` on the JST calendar day of ``jst``.

    Price rows are matched against the Close table on timestamp so only the
    price at the recorded close time is selected. Returns None when no close
    exists for that day (despite the ``float`` annotation).
    """
    raw = session \
        .query(Price.val) \
        .filter(cast(in_jst(Close.t), Date) == jst.date(),
                Close.ric == ric,
                Close.t == Price.t,
                Price.ric == ric) \
        .scalar()
    if raw is None:
        return None
    return float(raw)