def step_3_extract_origin_bodies(args):
    """
    Read origin articles from work directory and extract clean text from them.
    """
    origin_html_dir = os.path.join(args.work_dir, ORIGIN_HTML_DIR)
    origin_body_dir = os.path.join(args.work_dir, ORIGIN_BODY_DIR)
    text_util = TextUtil()
    origins = read_origins(args)
    clean_directory(origin_body_dir)
    for i, url in enumerate(origins):
        i_html_fp = os.path.join(origin_html_dir, "%d.html" % i)
        o_body_fp = os.path.join(origin_body_dir, "%d.json" % i)
        with open(i_html_fp, "rb") as i_fl:
            html = i_fl.read()
        body, lang_id = text_util.extract_body(url, html)
        with open(o_body_fp, "wb") as o_fl:
            json_dump({
                "body": body,
                "lang_id": lang_id,
            }, o_fl)
    logging.info("Extracted %d bodies." % len(origins))
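# Illustrative sketch only: `json_dump` is a project helper defined elsewhere. This
# assumed version merely matches the call sites above (object first, already-open file
# second) and writes UTF-8 JSON; the real implementation may differ.
def json_dump_sketch(obj, fl):
    import json
    fl.write(json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8"))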
def compute_title_prf(eval_data):
    text_util = TextUtil()
    gold_data = [entry[0] for entry in eval_data]
    gold_eval_out = [None] * len(eval_data)
    eval_out = []
    for method_id in xrange(1, len(eval_data[0])):
        method_data = [entry[method_id] for entry in eval_data]
        method_eval_out = []
        found_n = 0
        correct_n = 0
        data_size = 0
        for i in xrange(len(eval_data)):
            gold = gold_data[i]
            pred = method_data[i]
            gold = text_util.simplified_text(gold) if gold != "<NONE>" and gold != "" else None
            pred = text_util.simplified_text(pred) if pred != "<NONE>" and pred != "" else None
            correct = False  # stays False when gold or prediction is missing
            if gold is not None:
                data_size += 1
                if pred is not None:
                    found_n += 1
                    correct = (compare_titles(pred, gold, 3, True)
                               or compare_titles(pred, gold, 3, False))
                    if correct:
                        correct_n += 1
            gold_eval_out[i] = gold
            method_eval_out.append((pred, str(int(not correct))))
        p = 0 if found_n == 0 else float(correct_n) / float(found_n)
        r = 0 if data_size == 0 else float(correct_n) / float(data_size)
        f = 0 if p + r == 0 else p * r / (p + r) * 2
        prf = (p, r, f)
        eval_out.append((prf, method_eval_out))
    return gold_eval_out, eval_out
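# Worked illustration of the precision/recall/F1 arithmetic used in compute_title_prf.
# The counts below are hypothetical, not real evaluation results.
correct_n, found_n, data_size = 6, 8, 10
p = 0 if found_n == 0 else float(correct_n) / float(found_n)      # precision = 6/8  = 0.75
r = 0 if data_size == 0 else float(correct_n) / float(data_size)  # recall    = 6/10 = 0.6
f = 0 if p + r == 0 else p * r / (p + r) * 2                      # F1 = 2*0.75*0.6/1.35 = 0.666...
print p, r, f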
def step_4_extract_sentences(args):
    """
    Extract sentences and quotes and segment them.
    """
    origin_body_dir = os.path.join(args.work_dir, ORIGIN_BODY_DIR)
    origin_sentence_dir = os.path.join(args.work_dir, ORIGIN_SEGMENT_DIR)
    origins = read_origins(args)
    text_util = TextUtil()
    clean_directory(origin_sentence_dir)
    for i, url in enumerate(origins):
        i_body_fp = os.path.join(origin_body_dir, "%d.json" % i)
        o_segment_fp = os.path.join(origin_sentence_dir, "%d.json" % i)
        with open(i_body_fp, "rb") as i_fl:
            body_obj = json.load(i_fl)
        with open(o_segment_fp, "wb") as o_fl:
            body = body_obj["body"]
            lang_id = body_obj["lang_id"].encode("utf-8")
            sentences = text_util.sent_tokenize(body)
            quoted = text_util.extract_quoted(body)
            segments = text_util.select_segments(sentences, quoted)
            #########################################################
            # TODO: Cut long segments. Move this to a function or method
            # (see the cut_long_segments sketch after this function).
            for j in xrange(len(segments)):
                segment = segments[j]
                segment = text_util.simplified_text(segment, remove_punct=False)
                words = segment.split()
                if len(words) > 32:
                    words = words[:32]
                segments[j] = " ".join(words)
            ########################################################
            json_dump({
                "url": url,
                "text": body,
                "lang_id": lang_id,
                "sentences": sentences,
                "quoted": quoted,
                "segments": segments,
            }, o_fl)
        logging.info(("Extracted: %02d sent %02d quot %02d segm." % (
            len(sentences),
            len(quoted),
            len(segments),
        )).encode("utf-8"))
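# A possible refactoring of the TODO block above into a standalone helper. The name
# cut_long_segments and the max_words parameter are hypothetical; simplification is left
# to the caller (the pipeline applies text_util.simplified_text before trimming).
def cut_long_segments(segments, max_words=32):
    """Trim every segment to at most max_words whitespace-separated tokens."""
    trimmed = []
    for segment in segments:
        words = segment.split()
        if len(words) > max_words:
            words = words[:max_words]
        trimmed.append(" ".join(words))
    return trimmed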
class CseAPI(object):

    BASE_URL = "https://www.googleapis.com/customsearch/v1?"

    def __init__(self, key, engine_id):
        if key is None:
            raise CseError("API key cannot be None")
        self.key = key
        self.engine_id = engine_id
        self.config = None
        self.text_util = TextUtil()

    @staticmethod
    def from_config(api_config):
        api = CseAPI(key=api_config["googleApiKey"],
                     engine_id=api_config["googleEngineId"])
        api.config = GoogleApiConfig(api_config)
        return api

    def make_query(self,
                   query_string=None,
                   country=None,
                   date_start=None,
                   date_end=None,
                   exact_terms=None):
        q = {
            "q": query_string,
            # Country restrict(s).
            "cr": country,
            # Restricts all search results to a time period (string).
            "dateRestrict": None,
            # Identifies a phrase that all documents in the search results must contain (string).
            "exactTerms": exact_terms,
            # Identifies a word or phrase that should not appear in any documents
            # in the search results (string).
            "excludeTerms": None,
            # Restricts results to files of a specified extension, e.g.
            # bmp, gif, png, jpg, svg, pdf, ... (string).
            "fileType": None,
            # Number of search results to return (integer).
            "num": None,
            # Provides additional search terms to check for in a document, where each
            # document in the search results must contain at least one of the additional
            # search terms (string).
            "orTerms": None,
            # Specifies that all search results should be pages that are related to the
            # specified URL (string).
            "relatedSite": None,
            # The sort expression to apply to the results (string).
            "sort": None,
            # The index of the first result to return (integer).
            "start": None,
        }
        # .items() returns a list in Python 2, so deleting while iterating is safe here.
        for key, value in q.items():
            if value is None:
                del q[key]
        return q

    def make_query_pages(self, results_number=10, start=1):
        # NOTE: the start argument is currently ignored; pagination always begins at 1.
        start = 1
        page_size = 10.0
        pages = math.ceil(float(results_number) / page_size)
        for page in xrange(int(pages)):
            yield int(page * page_size + 1)

    def make_cse_url(self, query):
        params = urllib.urlencode(query)
        return self.BASE_URL + params

    def find_results(self, query, max_results=1000, upper_threshold=1000,
                     bottom_threshold=100, query_size_heuristic=10):
        result_urls = []
        found_results = []
        total_results = None
        query_size = self.text_util.words_count(query["exactTerms"])
        for start_index in self.make_query_pages(max_results):
            if total_results is not None and start_index > total_results:
                break
            query["start"] = start_index
            url = self.make_cse_url(query) + "&cx=%s&key=%s" % (self.engine_id, self.key)
            logging.debug(url)
            result_urls.append(url)
            try:
                api_response = requests.get(url).json()
            except Exception:
                logging.error("Error while calling '%s':%s" % (url, traceback.format_exc()))
                api_response = None
            if api_response is None:
                logging.warn("API response is None")
                break
            if "searchInformation" not in api_response:
                logging.warn("Search information not found")
                if total_results is None:
                    break
            if total_results is None:
                total_results = int(api_response["searchInformation"]["totalResults"])
                if total_results > upper_threshold:
                    logging.warn("Total results is too large %d" % total_results)
                    break
                if total_results > bottom_threshold and query_size < query_size_heuristic:
                    logging.warn("Too many results for too short query %d" % total_results)
                    break
            if "items" not in api_response or len(api_response["items"]) == 0:
                if total_results is None:
                    logging.warn("Items not found. Stop.")
                    break
                else:
                    logging.warn("Item not found. Skip.")
                    continue
            found_results.extend(api_response["items"])
        return found_results, result_urls, 0 if total_results is None else total_results

    def __repr__(self):
        return "<CseAPI(key='%s', engine='%s')>" % (self.key[:8] + "..",
                                                    self.engine_id[:8] + "..")
def step_9_enrich_story_graphs(args):
    origins = read_origins(args)
    graph_dir = os.path.join(args.work_dir, CROSSREF_OUT_DATA_DIR)
    story_dir = os.path.join(args.work_dir, STORY_GRAPH_DIR)
    clean_directory(story_dir)
    fetcher = PageFetcher()
    textutil = TextUtil()
    for i, origin_url in enumerate(origins):
        logging.info("Processing graph #%d" % i)
        graph_fp = os.path.join(graph_dir, "%d.json" % i)
        story_fp = os.path.join(story_dir, "%d.json" % i)
        with open(graph_fp, "rb") as graph_fl:
            graph = json.load(graph_fl)

        # Pre-process edges: deduplicate as unordered pairs.
        edges = set()
        for a, b in graph["edges"]:
            edges.add((min(a, b), max(a, b)))
        ref_counter = collections.Counter()
        for a, b in edges:
            ref_counter[a] += 1
            ref_counter[b] += 1

        # Find central reference, also index nodes by URL.
        central_ref_id = None
        url_index = {}
        for node in graph["nodes"].itervalues():
            node_url = node["url"]
            url_index[node_url] = node
            if node_url == origin_url:
                central_ref_id = node["refId"]
        if central_ref_id is None:
            logging.warn("Central reference not found. Selecting the most referenced.")
            central_ref_id = ref_counter.most_common(1)[0][0]
        logging.info(central_ref_id)
        urls = [node["url"] for node in graph["nodes"].itervalues()]

        # Download FB reactions.
        fb_counts = fetcher.get_facebook_counts(urls)
        logging.info("Fetched %d FB reactions" % len([fc for fc in fb_counts if fc is not None]))
        fb_comments = {}
        fb_shares = {}
        for fb_entry in (fc for fc in fb_counts if fc is not None):
            n_comments = fb_entry.get("comments", 0)
            n_shares = fb_entry.get("shares", 0)
            try:
                ref_id = url_index[fb_entry.get("id")]["refId"]
            except KeyError:
                try:
                    ref_id = url_index[fb_entry.get("url")]["refId"]
                except KeyError:
                    continue
            fb_comments[ref_id] = n_comments
            fb_shares[ref_id] = n_shares

        # Download TW reactions.
        tw_counts = fetcher.get_twitter_counts(urls)
        logging.info("Fetched %d TW reactions" % len([te for te in tw_counts if te is not None]))
        tw_shares = {}
        for tw_entry in (te for te in tw_counts if te is not None):
            n_shares = tw_entry.get("count", 0)
            tw_url = tw_entry["url"]
            node = url_index.get(tw_url)
            if node is None:
                # Retry without the trailing character (e.g. a trailing slash).
                node = url_index.get(tw_url[:-1])
            if node is None:
                continue
            ref_id = node["refId"]
            tw_shares[ref_id] = n_shares

        # Generate date distribution.
        # Find additional connection features.
        nodes = {}
        author2refs = {}
        author2sources = {}
        source2refs = {}
        source2authors = {}
        for node in graph["nodes"].itervalues():
            ref_id = node["refId"]
            node["twitterShares"] = tw_shares.get(ref_id, 0)
            node["facebookShares"] = fb_shares.get(ref_id, 0)
            node["facebookComments"] = fb_comments.get(ref_id, 0)
            node["referenceCount"] = ref_counter.get(ref_id, 0)
            node["citationCount"] = 0
            node["shareCount"] = node["twitterShares"] + node["facebookShares"]
            nodes[node["refId"]] = node
            node["authors"] = sorted(node["authors"])
            node["sources"] = list(reversed(sorted(node["sources"])))
            source = None
            if len(node["sources"]) > 0:
                source = node["sources"][0]
            for author in node["authors"]:
                if author not in author2refs:
                    author2refs[author] = {ref_id}
                else:
                    author2refs[author].add(ref_id)
                if source is not None:
                    if author not in author2sources:
                        author2sources[author] = {source}
                    else:
                        author2sources[author].add(source)
            if source is not None:
                if source not in source2refs:
                    source2refs[source] = {ref_id}
                else:
                    source2refs[source].add(ref_id)
                if source not in source2authors:
                    source2authors[source] = set(node["authors"])
                else:
                    source2authors[source].update(node["authors"])

        authors = []
        sources = []
        for author, references in author2refs.iteritems():
            authors.append({
                "name": author,
                "referenceCount": len(references),
                "references": list(references),
                # Sources this author has written for.
                "authors": list(author2sources.get(author, [])),
            })
        for source, references in source2refs.iteritems():
            sources.append({
                "name": source,
                "referenceCount": len(references),
                "references": list(references),
                "authors": list(source2authors.get(source, [])),
            })

        edges = list(edges)
        html = fetcher.fetch(nodes[central_ref_id]["url"])
        paragraphs = textutil.get_pretty_markup(html)
        paragraphs = [p.encode("utf-8") for p in paragraphs]
        bodies = [(node["body"].encode("utf-8"), node["refId"])
                  for node in nodes.itervalues()
                  if node["refId"] != central_ref_id]
        # text_markup = [textutil.generate_markup(p_text, bodies) for p_text in paragraphs]
        text = nodes[central_ref_id]["text"].encode("utf-8")
        title = nodes[central_ref_id]["title"].encode("utf-8")
        markup = textutil.generate_markup(title, text, paragraphs, bodies)
        story_graph = {
            "edges": edges,
            "nodes": nodes,
            "meta": {
                "centralNode": central_ref_id,
                "citations": [],
                "markup": markup.json(),
                "authors": authors,
                "sources": sources,
            },
        }
        story_fp = os.path.join(story_dir, "%d.json" % i)
        with open(story_fp, "wb") as o_fl:
            json_dump(story_graph, o_fl)
def step_6_filter_out_unrelated(args):
    """
    Filter out unrelated documents found by Google.
    """
    origin_gse_dir = os.path.join(args.work_dir, ORIGIN_GSE_DIR)
    related_links_dir = os.path.join(args.work_dir, RELATED_LINKS_DIR)
    origins = read_origins(args)
    black_domains = Blacklist.load(Blacklist.BLACK_DOM)
    fetcher = PageFetcher()
    text_util = TextUtil()
    clean_directory(related_links_dir)
    for i, url in enumerate(origins):
        i_gse_fp = os.path.join(origin_gse_dir, "%d.json" % i)
        o_link_fp = os.path.join(related_links_dir, "%d.json" % i)
        related_url2gse = {}   # Google search annotation for each URL
        related_url2html = {}  # HTML for related URL
        related_url2segm = {}  # all "linked" sentences for related URL
        uniq_segments = set()
        with open(i_gse_fp, "rb") as i_fl:
            gse = json.load(i_fl)
        for segment_entry in gse:
            segment_text = text_util.simplified_text(segment_entry["segment"])
            uniq_segments.add(segment_text)
            for gse_found_item in segment_entry["foundItems"]:
                item_url = gse_found_item["link"]
                if item_url in black_domains:
                    # logging.warn("Blacklisted related url %s" % item_url)
                    continue
                if item_url not in related_url2gse:
                    related_url2gse[item_url] = gse_found_item
                if item_url not in related_url2segm:
                    related_url2segm[item_url] = {segment_text}
                else:
                    related_url2segm[item_url].add(segment_text)
                if item_url not in related_url2html:
                    related_url2html[item_url] = None
        fetcher.fetch_urls(related_url2html, max_threads=args.max_threads)

        # Now filter out unrelated documents.
        # 1. Put all fetched URLs into the related set.
        related_urls = set(related_url2html.iterkeys())
        filtered_urls = []
        fuzzy_patterns = text_util.compile_fuzzy_patterns(uniq_segments)
        for j, rel_url in enumerate(related_urls):
            if j % 10 == 0:
                logging.info("Fuzzy matching (%d) %d/%d" % (i, j, len(related_urls)))
            html = related_url2html[rel_url]
            body, _ = text_util.extract_body(rel_url, html)
            body = text_util.simplified_text(body)
            segments = related_url2segm[rel_url]
            best_ratio = 0.0
            matches = []
            for segment in segments:
                fuzzy_pattern = fuzzy_patterns[segment]
                ratio, match = text_util.fuzzy_search(body, segment, fuzzy_pattern)
                matches.append({
                    "match": match,
                    "ratio": ratio,
                })
                if ratio > best_ratio:
                    best_ratio = ratio
            gse_data = related_url2gse[rel_url]
            filtered_urls.append({
                "url": rel_url,
                "segments": list(segments),
                "body": body,
                "bestRatio": best_ratio,
                "foundMatches": matches,
                "highRatio": best_ratio > 0.5,
                "gseData": gse_data,
                "html": html,
            })
        with open(o_link_fp, "wb") as o_fl:
            json_dump(filtered_urls, o_fl)
class ReferenceIndex(object):
    """
    Index of reference entries with exact and fuzzy sentence-level lookup.
    """

    def __init__(self, entries):
        self.id2entry = {}
        self.text_index = {}
        self.text_util = TextUtil()
        for entry in entries:
            self.id2entry[entry.ref_id] = entry

    def iterentries(self):
        return self.id2entry.itervalues()

    def index(self):
        for entry in self.id2entry.itervalues():
            sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
            quotes = self.text_util.extract_quoted(entry.text)
            sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
            sentences = sanitize_sentences(sentences)
            for sent in sentences:
                if sent in self.text_index:
                    self.text_index[sent].add(entry.ref_id)
                else:
                    self.text_index[sent] = {entry.ref_id}

    def find_text_references(self, entry):
        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
        sentences = sanitize_sentences(sentences)
        sent_refs = {}
        for sent in sentences:
            found = self.text_index.get(sent)
            if found is None:
                # Sentence was not indexed (e.g. index() has not seen this entry yet).
                continue
            for ref_id in found:
                if ref_id == entry.ref_id:
                    continue
                if sent in sent_refs:
                    sent_refs[sent].add(ref_id)
                else:
                    sent_refs[sent] = {ref_id}
        if len(sent_refs) == 0:
            return []
        uniq_refs = set()
        for ref_id_set in sent_refs.itervalues():
            uniq_refs.update(ref_id_set)
        ref_pairs = [(ref_id, entry.ref_id) for ref_id in uniq_refs]
        return ref_pairs

    def extract_query_sentence(self, entry, trim=32):
        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
        sentences = [self.text_util.simplified_text(s) for s in sentences]
        if trim is not None and trim > 0:
            for i in xrange(len(sentences)):
                sentences[i] = " ".join(sentences[i].split()[:trim])
        return sentences

    def extract_references(self, entries):
        found_links = set()
        for entry in entries:
            entry_refs = self.find_text_references(entry)
            found_links.update(entry_refs)
        return found_links

    def fuzzy_extract_references(self, entries):
        entries = list(entries)
        sentence2entry = {}
        # Extract sentences.
        logging.info("Extracting sentences.")
        for entry in entries:
            q_sentences = self.extract_query_sentence(entry, trim=32)
            for sent in q_sentences:
                if sent not in sentence2entry:
                    sentence2entry[sent] = {entry.ref_id}
                else:
                    sentence2entry[sent].add(entry.ref_id)
        # Compile regexes.
        logging.info("Compiling regexes.")
        sentence2regex = {}
        for sent in sentence2entry.iterkeys():
            if sent not in sentence2regex:
                regex = self.text_util.compile_fuzzy_pattern(sent)
                sentence2regex[sent] = regex
        logging.info("Compiled %d." % len(sentence2regex))
        # Find text matches.
        logging.info("Fuzzy matching.")
        entry2matches = {}
        for i, entry in enumerate(entries):
            entry2matches[entry.ref_id] = set()
            for sent, regex in sentence2regex.iteritems():
                if self.text_util.ffs(entry.body, sent, regex):
                    sentence_entries = sentence2entry[sent]
                    for ref_id in sentence_entries:
                        if ref_id == entry.ref_id:
                            continue
                        entry2matches[entry.ref_id].add(ref_id)
            logging.info("Done %d/%d." % (i + 1, len(entries)))
        found_pairs = []
        for entry, matches in entry2matches.iteritems():
            for m in matches:
                found_pairs.append((entry, m))
        return found_pairs

    def find_cross_references(self, sent_window_size=3):
        found_links = self.fuzzy_extract_references(self.iterentries())
        return found_links

    def print_titles(self):
        for entry in self.id2entry.itervalues():
            print entry.ref_id, "\t * %s" % entry.title

    def __repr__(self):
        return "<RefIndex(entries=%d)>" % len(self.id2entry)
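# Hedged usage sketch of ReferenceIndex. The Entry namedtuple and the toy texts below are
# hypothetical; the real pipeline passes entry objects with at least ref_id, title, text,
# and body attributes, which is all this class relies on.
import collections

Entry = collections.namedtuple("Entry", ["ref_id", "title", "text", "body"])

entries = [
    Entry(1, "Origin story", "Full article text of the origin story.", "origin story body"),
    Entry(2, "Follow-up", "A follow-up quoting the origin story.", "follow-up body"),
]
ref_index = ReferenceIndex(entries)
ref_index.index()                          # build the exact sentence -> ref_id index
pairs = ref_index.find_cross_references()  # fuzzy cross-references as (ref_id, ref_id) pairs
print ref_index, len(pairs)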