def load_sentence(sent_dict):
    """Build a ``Sentence`` from its serialized dictionary form.

    Args:
        sent_dict: Mapping with the keys ``"tokens"``, ``"pub_time"`` and
            ``"raw"``.

    Returns:
        ``Sentence(raw, tokens, pub_time, time, time_level)`` where:

        * ``time_level`` is ``"d"``, ``"m"``, ``"y"`` or ``None``;
        * for ``"d"``, ``time`` is a ``datetime.datetime`` at midnight;
        * for ``"m"`` / ``"y"``, ``time`` is a ``(start, end)`` pair of
          midnight ``datetime.datetime`` values taken from
          ``time.span("month")`` / ``time.span("year")``;
        * when no time is found, ``time`` is the falsy value returned by
          ``Sentence.get_time`` and ``time_level`` is ``None``.
    """

    def _midnight(moment):
        # Keep only the calendar date; hour/minute/second default to zero.
        return datetime.datetime(moment.year, moment.month, moment.day)

    tokens = load_tokens(sent_dict["tokens"])
    pub_time = utils.strip_to_date(arrow.get(sent_dict["pub_time"]))

    time = Sentence.get_time(tokens)
    time_level = None
    if not time:
        # Nothing to normalise: pass the falsy value through untouched.
        return Sentence(sent_dict["raw"], tokens, pub_time, time, time_level)

    time = arrow.get(time)
    time_format = Sentence.get_time_format(tokens)

    if "d" in time_format:
        time = _midnight(time)
        time_level = "d"
    else:
        # Month takes precedence over year when both markers are present.
        if "m" in time_format:
            frame, time_level = "month", "m"
        elif "y" in time_format:
            frame, time_level = "year", "y"
        else:
            # No recognised marker: `time` stays as returned by arrow.get.
            frame = None

        if frame is not None:
            first, last = time.span(frame)
            time = (_midnight(first), _midnight(last))

    return Sentence(sent_dict["raw"], tokens, pub_time, time, time_level)
def load_sentence(sent_dict):
    """Reconstruct a ``Sentence`` object from a dictionary.

    Reads ``sent_dict['tokens']``, ``sent_dict['pub_time']`` and
    ``sent_dict['raw']``. The time obtained from ``Sentence.get_time`` is
    normalised according to the markers found in
    ``Sentence.get_time_format(tokens)``:

    * ``'d'``: a single ``datetime.datetime`` at midnight, level ``'d'``;
    * ``'m'`` (checked before ``'y'``): a ``(start, end)`` tuple built from
      ``time.span('month')``, level ``'m'``;
    * ``'y'``: a ``(start, end)`` tuple built from ``time.span('year')``,
      level ``'y'``;
    * none of these: the value from ``arrow.get`` is kept, level ``None``.

    If ``Sentence.get_time`` returns a falsy value it is passed through
    unchanged with level ``None``.

    Returns:
        ``Sentence(raw, tokens, pub_time, time, time_level)``.
    """
    tokens = load_tokens(sent_dict['tokens'])
    pub_time = utils.strip_to_date(arrow.get(sent_dict['pub_time']))
    time = Sentence.get_time(tokens)
    time_level = None

    if time:
        time = arrow.get(time)
        time_format = Sentence.get_time_format(tokens)

        if 'd' in time_format:
            time = datetime.datetime(time.year, time.month, time.day)
            time_level = 'd'
        else:
            # Ordered so that a month marker wins over a year marker.
            for marker, frame in (('m', 'month'), ('y', 'year')):
                if marker not in time_format:
                    continue
                lower, upper = time.span(frame)
                time = (
                    datetime.datetime(lower.year, lower.month, lower.day),
                    datetime.datetime(upper.year, upper.month, upper.day),
                )
                time_level = marker
                break

    return Sentence(sent_dict['raw'], tokens, pub_time, time, time_level)
def temporal_graph(self, X, times):
    """Build a sparse cosine-similarity matrix between temporally close items.

    Items are grouped by ``utils.strip_to_date(t)`` and a window of up to
    ``self.max_days + 1`` consecutive days is slid over the covered range.
    The first window compares every pair of items it contains; each later
    window only compares its contents against the items of its last
    (newly entered) day, so earlier pairs are not recomputed.

    Args:
        X: Row-indexable feature matrix with one row per item, either a
            SciPy sparse matrix or something ``np.vstack`` can stack.
        times: One time value per row of ``X``, in the same order.
            Stripped values must support ``min``/``max``, subtraction
            yielding an object with ``.days``, and adding a ``timedelta``.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n_items, n_items)`` where
        entry ``[i, j]`` holds the cosine similarity of rows ``i`` and ``j``
        for the pairs that were compared; all other entries are zero.

    Raises:
        ValueError: If ``times`` is empty (raised by ``min``).
    """
    times = [utils.strip_to_date(t) for t in times]
    time_to_ixs = collections.defaultdict(list)
    for i, t in enumerate(times):
        time_to_ixs[t].append(i)

    n_items = X.shape[0]
    S = sparse.lil_matrix((n_items, n_items))
    # The container type of X never changes, so pick the stacker once.
    stack = sparse.vstack if sparse.issparse(X) else np.vstack

    start, end = min(times), max(times)
    total_days = (end - start).days + 1

    for n in range(total_days + 1):
        t = start + datetime.timedelta(days=n)
        # Shrink the window near the end of the covered range.
        window_size = min(self.max_days + 1, total_days + 1 - n)
        window = [t + datetime.timedelta(days=k) for k in range(window_size)]
        indices = [i for day in window for i in time_to_ixs[day]]

        if n == 0 or len(window) == 1:
            # Full pairwise comparison inside the window.
            if not indices:
                continue
            X_n = stack([X[i] for i in indices])
            S_n = cosine_similarity(X_n)
            for i_n, i_x in enumerate(indices):
                for j_n in range(i_n + 1, len(indices)):
                    # Position j_n in S_n corresponds to item indices[j_n].
                    # The previous code zipped `indices` from its start,
                    # which wrote similarities to the wrong cells.
                    S[i_x, indices[j_n]] = S_n[i_n, j_n]
        else:
            # `indices` holds the previous days *and* the new day, so the
            # new items are also compared with each other here.
            new_indices = time_to_ixs[window[-1]]
            if not new_indices:
                continue
            X_prev = stack([X[i] for i in indices])
            X_new = stack([X[i] for i in new_indices])
            S_n = cosine_similarity(X_prev, X_new)
            for i_n, i_x in enumerate(indices):
                for j_n, j_x in enumerate(new_indices):
                    S[i_x, j_x] = S_n[i_n, j_n]

    return sparse.csr_matrix(S)
def get_input_time_span(ref_dates, extension):
    """Return the reference date range widened by ``extension`` days.

    Args:
        ref_dates: Non-empty iterable of comparable date values accepted by
            ``utils.strip_to_date``.
        extension: Number of days to subtract from the earliest and add to
            the latest reference date.

    Returns:
        Tuple ``(input_start, input_end)``.
    """
    earliest = utils.strip_to_date(min(ref_dates))
    latest = utils.strip_to_date(max(ref_dates))
    margin = datetime.timedelta(days=extension)
    return earliest - margin, latest + margin