Code example #1
File: google.py  Project: fullbright/nlcd
    def __init__(self, key, engine_id):
        if key is None:
            raise CseError("API key cannot be None")
        self.key = key
        self.engine_id = engine_id
        self.config = None
        self.text_util = TextUtil()
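
For context, a minimal sketch of how this constructor would presumably be called (the class is CseAPI, shown in full in a later example; the key and engine id strings below are placeholders, and CseError/TextUtil come from the surrounding nlcd modules):

# Placeholder credentials, not real values.
api = CseAPI(key="MY_API_KEY", engine_id="MY_ENGINE_ID")

# A missing key is rejected up front instead of failing later on an HTTP request.
try:
    CseAPI(key=None, engine_id="MY_ENGINE_ID")
except CseError as err:
    print(err)
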
Code example #2
File: pipeline.py  Project: fullbright/nlcd
def step_3_extract_origin_bodies(args):
    """
    Read origin articles from work directory and extract clean text from them.
    """
    origin_html_dir = os.path.join(args.work_dir, ORIGIN_HTML_DIR)
    origin_body_dir = os.path.join(args.work_dir, ORIGIN_BODY_DIR)

    text_util = TextUtil()
    origins = read_origins(args)

    clean_directory(origin_body_dir)

    for i, url in enumerate(origins):

        i_html_fp = os.path.join(origin_html_dir, "%d.html" % i)
        o_body_fp = os.path.join(origin_body_dir, "%d.json" % i)

        with open(i_html_fp, "rb") as i_fl:

            html = i_fl.read()
            body, lang_id = text_util.extract_body(url, html)

            with open(o_body_fp, "wb") as o_fl:
                json_dump({
                    "body": body,
                    "lang_id": lang_id,
                }, o_fl)

    logging.info("Extracted %d bodies." % len(origins))
Code example #3
File: rd.py  Project: fullbright/nlcd
    def __init__(self, entries):

        self.id2entry = {}
        self.text_index = {}

        self.text_util = TextUtil()

        for entry in entries:
            self.id2entry[entry.ref_id] = entry
Code example #4
File: eval.py  Project: fullbright/nlcd
def compute_title_prf(eval_data):

    text_util = TextUtil()

    gold_data = [entry[0] for entry in eval_data]

    gold_eval_out = [None] * len(eval_data)
    eval_out = []

    for method_id in xrange(1, len(eval_data[0])):

        method_data = [entry[method_id] for entry in eval_data]
        method_eval_out = []

        found_n = 0
        correct_n = 0
        data_size = 0

        for i in xrange(len(eval_data)):

            gold = gold_data[i]
            pred = method_data[i]

            gold = text_util.simplified_text(
                gold) if gold != "<NONE>" and gold != "" else None
            pred = text_util.simplified_text(
                pred) if pred != "<NONE>" and pred != "" else None

            if gold is not None:
                data_size += 1

            if pred is not None:
                found_n += 1

            correct = compare_titles(pred, gold, 3, True) or compare_titles(
                pred, gold, 3, False)

            if correct:
                correct_n += 1

            gold_eval_out[i] = gold
            method_eval_out.append((pred, str(int(not correct))))

        p = 0 if found_n == 0 else float(correct_n) / float(found_n)
        r = 0 if data_size == 0 else float(correct_n) / float(data_size)
        f = 0 if p + r == 0 else p * r / (p + r) * 2

        prf = (p, r, f)

        eval_out.append((prf, method_eval_out))

    return gold_eval_out, eval_out
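
The precision/recall/F1 arithmetic at the end is the standard one; as a quick sanity check with made-up counts (these numbers are illustrative, not taken from any evaluation run):

# Toy counts: 8 correct out of 10 predicted titles, against 16 gold titles.
correct_n, found_n, data_size = 8, 10, 16

p = float(correct_n) / float(found_n)    # precision = 0.8
r = float(correct_n) / float(data_size)  # recall    = 0.5
f = p * r / (p + r) * 2                  # F1        ~ 0.615

print("%.3f %.3f %.3f" % (p, r, f))
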
Code example #5
File: eval.py  Project: LucidAi/nlcd
def compute_title_prf(eval_data):

    text_util = TextUtil()

    gold_data = [entry[0] for entry in eval_data]

    gold_eval_out = [None] * len(eval_data)
    eval_out = []

    for method_id in xrange(1, len(eval_data[0])):

        method_data = [entry[method_id] for entry in eval_data]
        method_eval_out = []

        found_n = 0
        correct_n = 0
        data_size = 0

        for i in xrange(len(eval_data)):

            gold = gold_data[i]
            pred = method_data[i]

            gold = text_util.simplified_text(gold) if gold != "<NONE>" and gold != "" else None
            pred = text_util.simplified_text(pred) if pred != "<NONE>" and pred != "" else None

            if gold is not None:
                data_size += 1

            if pred is not None:
                found_n += 1

            correct = compare_titles(pred, gold, 3, True) or compare_titles(pred, gold, 3, False)

            if correct:
                correct_n += 1

            gold_eval_out[i] = gold
            method_eval_out.append((pred, str(int(not correct))))

        p = 0 if found_n == 0 else float(correct_n) / float(found_n)
        r = 0 if data_size == 0 else float(correct_n) / float(data_size)
        f = 0 if p + r == 0 else p * r / (p + r) * 2

        prf = (p, r, f)

        eval_out.append((prf, method_eval_out))

    return gold_eval_out, eval_out
Code example #6
File: google.py  Project: LucidAi/nlcd
    def __init__(self, key, engine_id):
        if key is None:
            raise CseError("API key cannot be None")
        self.key = key
        self.engine_id = engine_id
        self.config = None
        self.text_util = TextUtil()
Code example #7
File: rd.py  Project: LucidAi/nlcd
    def __init__(self, entries):

        self.id2entry = {}
        self.text_index = {}

        self.text_util = TextUtil()

        for entry in entries:
            self.id2entry[entry.ref_id] = entry
Code example #8
File: pipeline.py  Project: fullbright/nlcd
def step_4_extract_sentences(args):
    """
    Extract sentences and quotes and segment them.
    """
    origin_body_dir = os.path.join(args.work_dir, ORIGIN_BODY_DIR)
    origin_sentence_dir = os.path.join(args.work_dir, ORIGIN_SEGMENT_DIR)
    origins = read_origins(args)

    text_util = TextUtil()

    clean_directory(origin_sentence_dir)

    for i, url in enumerate(origins):

        i_body_fp = os.path.join(origin_body_dir, "%d.json" % i)
        o_segment_fp = os.path.join(origin_sentence_dir, "%d.json" % i)

        with open(i_body_fp, "rb") as i_fl:

            body_obj = json.load(i_fl)

            with open(o_segment_fp, "wb") as o_fl:

                body = body_obj["body"]
                lang_id = body_obj["lang_id"].encode("utf-8")

                sentences = text_util.sent_tokenize(body)
                quoted = text_util.extract_quoted(body)
                segments = text_util.select_segments(sentences, quoted)
                
                #########################################################
                #TODO: Cut long segments. Move this to function or method

                for j in xrange(len(segments)):

                    segment = segments[j]
                    segment = text_util.simplified_text(segment, remove_punct=False)
                    words = segment.split()

                    if len(words) > 32:
                        words = words[:32]

                    segments[j] = " ".join(words)
                
                ########################################################

                json_dump({
                    "url": url,
                    "text": body,
                    "lang_id": lang_id,
                    "sentences": sentences,
                    "quoted": quoted,
                    "segments": segments,
                }, o_fl)

                logging.info(("Extracted:    %02d sent    %02d quot    %02d segm." % (
                    len(sentences),
                    len(quoted),
                    len(segments)
                )).encode("utf-8"))
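
The TODO above asks for the segment-trimming block to be factored out; one possible shape, reusing the 32-word cap and the remove_punct=False call from the code (the helper name and signature are invented here):

def trim_segments(segments, text_util, max_words=32):
    # Simplify each segment and cap it at max_words words.
    trimmed = []
    for segment in segments:
        simplified = text_util.simplified_text(segment, remove_punct=False)
        words = simplified.split()[:max_words]
        trimmed.append(" ".join(words))
    return trimmed
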
Code example #9
File: google.py  Project: LucidAi/nlcd
class CseAPI(object):
    BASE_URL = "https://www.googleapis.com/customsearch/v1?"

    def __init__(self, key, engine_id):
        if key is None:
            raise CseError("API key cannot be None")
        self.key = key
        self.engine_id = engine_id
        self.config = None
        self.text_util = TextUtil()

    @staticmethod
    def from_config(api_config):
        api = CseAPI(key=api_config["googleApiKey"], engine_id=api_config["googleEngineId"])
        api.config = GoogleApiConfig(api_config)
        return api

    def make_query(self,
                     query_string=None,
                     country=None,
                     date_start=None,
                     date_end=None,
                     exact_terms=None,
                     ):
        q = {
            "q": query_string,

            "cr": country,                  # Country restrict(s)
            "dateRestrict": None,           # Specifies all search results are from a time period (string)
            "exactTerms": exact_terms,      # Identifies a phrase that all documents in the search results must
                                            # contain (string)
            "excludeTerms": None,           # Identifies a word or phrase that should not appear in any documents
                                            # in the search results (string)
            "fileType": None,               # Returns images of a specified type. Some of the allowed values are:
                                            # bmp, gif, png, jpg, svg, pdf, ... (string)
            "num": None,                    # Number of search results to return (integer),
            "orTerms": None,                # Provides additional search terms to check for in a document, where each
                                            # document in the search results must contain at least one of the additional
                                            # search terms (string)
            "relatedSite": None,            # Specifies that all search results should be pages that are related to the
                                            # specified URL (string)
            "sort": None,                   # The sort expression to apply to the results (string)
            "start": None,                  # The index of the first result to return (integer)

        }
        for key, value in q.items():
            if value is None:
                del q[key]
        return q

    def make_query_pages(self, results_number=10, start=1):
        start = 1
        page_size = 10.0
        pages = math.ceil(float(results_number) / page_size)
        for page in xrange(int(pages)):
            yield int(page * page_size + 1)

    def make_cse_url(self, query):
        params = urllib.urlencode(query)
        return self.BASE_URL + params

    def find_results(self, query,
                     max_results=1000,
                     upper_threshold=1000,
                     bottom_threshold=100,
                     query_size_heuristic=10):

        result_urls = []
        found_results = []
        total_results = None

        query_size = self.text_util.words_count(query["exactTerms"])

        for start_index in self.make_query_pages(max_results):

            if total_results is not None and start_index > total_results:
                break

            query["start"] = start_index

            url = self.make_cse_url(query) + "&cx=%s&key=%s" % (self.engine_id, self.key)

            logging.debug(url)

            result_urls.append(url)

            try:
                api_response = requests.get(url).json()
            except Exception:
                logging.error("Error while calling '%s':%s" % (url,  traceback.format_exc()))
                api_response = None

            if api_response is None:
                logging.warn("API response is None")
                break

            if "searchInformation" not in api_response:
                logging.warn("Search information not found")
                if total_results is None:
                    break

            if total_results is None:
                total_results = int(api_response["searchInformation"]["totalResults"])

            if total_results > upper_threshold:
                logging.warn("Total results it too large %d" % total_results)
                break

            if total_results > bottom_threshold and query_size < query_size_heuristic:
                logging.warn("Too many results for too short query %d" % total_results)
                break

            if "items" not in api_response or len(api_response["items"]) == 0:
                if total_results is None:
                    logging.warn("Items not found. Stop.")
                    break
                else:
                    logging.warn("Item not found. Skip.")
                    continue

            found_results.extend(api_response["items"])

        return found_results, result_urls, 0 if total_results is None else total_results

    def __repr__(self):
        return "<CseAPI(key='%s', engine='%s')>" % (self.key[:8]+"..", self.engine_id[:8]+"..")
Code example #10
File: pipeline.py  Project: fullbright/nlcd
def step_9_enrich_story_graphs(args):

    origins = read_origins(args)

    graph_dir = os.path.join(args.work_dir, CROSSREF_OUT_DATA_DIR)
    story_dir = os.path.join(args.work_dir, STORY_GRAPH_DIR)

    clean_directory(story_dir)

    fetcher = PageFetcher()
    textutil = TextUtil()

    for i, origin_url in enumerate(origins):

        logging.info("Processing graph #%d" % i)

        graph_fp = os.path.join(graph_dir, "%d.json" % i)
        story_fp = os.path.join(story_dir, "%d.json" % i)

        with open(graph_fp, "rb") as graph_fl:
            graph = json.load(graph_fl)

        # Pre-process edges
        edges = set()
        for a, b in graph["edges"]:
            edges.add((min(a, b), max(a, b)))
        ref_counter = collections.Counter()
        for a, b in edges:
            ref_counter[a] += 1
            ref_counter[b] += 1

        # Find central reference, also index by url
        central_ref_id = None

        url_index = {}
        for node in graph["nodes"].itervalues():
            node_url = node["url"]
            url_index[node_url] = node
            if node_url == origin_url:
                central_ref_id = node["refId"]

        if central_ref_id is None:
            logging.warn("Central reference not found. Selecting the most referenced.")
            central_ref_id = ref_counter.most_common(1)[0][0]
            logging.info(central_ref_id)

        urls = [node["url"] for node in graph["nodes"].itervalues()]

        # Download FB reactions

        fb_counts = fetcher.get_facebook_counts(urls)
        logging.info("Fetched %d FB reactions" % len([fc for fc in fb_counts if fc is not None]))
        fb_comments = {}
        fb_shares = {}

        for fb_entry in (fc for fc in fb_counts if fc is not None):
            n_comments = fb_entry.get("comments", 0)
            n_shares = fb_entry.get("shares", 0)

            try:
                ref_id = url_index[fb_entry.get("id")]["refId"]
            except KeyError:
                try:
                    ref_id = url_index[fb_entry.get("url")]["refId"]
                except KeyError:
                    continue

            fb_comments[ref_id] = n_comments
            fb_shares[ref_id] = n_shares

        # Download TW reactions
        tw_counts = fetcher.get_twitter_counts(urls)
        logging.info("Fetched %d TW reactions" % len([te for te in tw_counts if te is not None]))
        tw_shares = {}
        for tw_entry in (te for te in tw_counts if te is not None):
            n_shares = tw_entry.get("count", 0)
            tw_url = tw_entry["url"]
            node = url_index.get(tw_url)
            if node is None:
                node = url_index.get(tw_url[:-1])
                if node is None:
                    continue
            ref_id = node["refId"]
            tw_shares[ref_id] = n_shares

        # Generate date distribution

        # Find additional connection features

        nodes = {}

        author2refs = {}
        author2sources = {}

        source2refs = {}
        source2authors = {}

        for node in graph["nodes"].itervalues():

            ref_id = node["refId"]

            node["twitterShares"] = tw_shares.get(ref_id, 0)
            node["facebookShares"] = fb_shares.get(ref_id, 0)
            node["facebookComments"] = fb_comments.get(ref_id, 0)
            node["referenceCount"] = ref_counter.get(ref_id, 0)
            node["citationCount"] = 0
            node["shareCount"] = node["twitterShares"] + node["facebookShares"]

            nodes[node["refId"]] = node

            node["authors"] = sorted(node["authors"])
            node["sources"] = list(reversed(sorted(node["sources"])))

            source = None
            if len(node["sources"]) > 0:
                source = node["sources"][0]

            for author in node["authors"]:
                if author not in author2refs:
                    author2refs[author] = {ref_id}
                else:
                    author2refs[author].add(ref_id)
                if source is not None:
                    if author not in author2sources:
                        author2sources[author] = {source}
                    else:
                        author2sources[author].add(source)

            if source is not None:

                if source not in source2refs:
                    source2refs[source] = {ref_id}
                else:
                    source2refs[source].add(ref_id)

                if source not in source2authors:
                    source2authors[source] = set(node["authors"])
                else:
                    source2authors[source].update(node["authors"])

        authors = []
        sources = []

        for author, references in author2refs.iteritems():
            authors.append({
                "name": author,
                "referenceCount": len(references),
                "references": list(references),
                "authors": list(author2sources.get(author, []))
            })

        for source, references in source2refs.iteritems():
            sources.append({
                "name": source,
                "referenceCount": len(references),
                "references": list(references),
                "authors": list(source2authors.get(source, []))
            })

        edges = list(edges)

        html = fetcher.fetch(nodes[central_ref_id]["url"])
        paragraphs = textutil.get_pretty_markup(html)
        paragraphs = [p.encode("utf-8") for p in paragraphs]

        bodies = [(node["body"].encode("utf-8"), node["refId"])
                   for node in nodes.itervalues()
                   if node["refId"] != central_ref_id]

        # text_markup = [textutil.generate_markup(p_text, bodies)  for p_text in paragraphs]
        text = nodes[central_ref_id]["text"].encode("utf-8")
        title = nodes[central_ref_id]["title"].encode("utf-8")
        markup = textutil.generate_markup(title, text, paragraphs, bodies)

        story_graph = {

            "edges": edges,
            "nodes": nodes,
            "meta": {
                "centralNode": central_ref_id,
                "citations": [],
                "markup": markup.json(),
                "authors": authors,
                "sources": sources,
            }

        }

        story_fp = os.path.join(story_dir, "%d.json" % i)

        with open(story_fp, "wb") as o_fl:
            json_dump(story_graph, o_fl)
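
The edge pre-processing above treats the graph as undirected; the same normalize-and-count step in isolation might look like this (standalone names, same logic as in the snippet):

import collections

def count_references(edge_list):
    # Collapse (a, b) and (b, a) into one undirected edge, then count
    # how many distinct edges touch each node.
    edges = set((min(a, b), max(a, b)) for a, b in edge_list)
    counter = collections.Counter()
    for a, b in edges:
        counter[a] += 1
        counter[b] += 1
    return edges, counter
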
Code example #11
File: pipeline.py  Project: fullbright/nlcd
def step_6_filter_out_unrelated(args):
    """
    Filter out unrelated documents found by Google.
    """

    origin_gse_dir = os.path.join(args.work_dir, ORIGIN_GSE_DIR)
    related_links_dir = os.path.join(args.work_dir, RELATED_LINKS_DIR)
    origins = read_origins(args)
    black_domains = Blacklist.load(Blacklist.BLACK_DOM)
    fetcher = PageFetcher()
    text_util = TextUtil()

    clean_directory(related_links_dir)

    for i, url in enumerate(origins):

        i_gse_fp = os.path.join(origin_gse_dir, "%d.json" % i)
        o_link_fp = os.path.join(related_links_dir, "%d.json" % i)

        related_url2gse = {}    # Google search annotation for each URL
        related_url2html = {}   # HTML for related URL
        related_url2segm = {}   # all "linked" sentences for related URL

        uniq_segments = set()

        with open(i_gse_fp, "rb") as i_fl:
            gse = json.load(i_fl)

        for segment_entry in gse:

            segment_text = text_util.simplified_text(segment_entry["segment"])
            
            uniq_segments.add(segment_text)

            for gse_found_item in segment_entry["foundItems"]:

                item_url = gse_found_item["link"]

                if item_url in black_domains:
                    # logging.warn("Blacklisted related url %s" % item_url)
                    continue

                if item_url not in related_url2gse:
                    related_url2gse[item_url] = gse_found_item

                if item_url not in related_url2segm:
                    related_url2segm[item_url] = {segment_text}
                else:
                    related_url2segm[item_url].add(segment_text)

                if item_url not in related_url2html:
                    related_url2html[item_url] = None
        
        fetcher.fetch_urls(related_url2html, max_threads=args.max_threads)

        # Now, filter out the unrelated ones

        # 1. Put all fetched urls into related set
        related_urls = set(related_url2html.iterkeys())

        filtered_urls = []

        fuzzy_patterns = text_util.compile_fuzzy_patterns(uniq_segments)

        for j, rel_url in enumerate(related_urls):

            if j % 10 == 0:
                logging.info("Fuzzy matching (%d) %d/%d" % (i, j, len(related_urls)))

            html = related_url2html[rel_url]
            body, _ = text_util.extract_body(rel_url, html)
            body = text_util.simplified_text(body)
            segments = related_url2segm[rel_url]

            best_ratio = 0.0
            matches = []

            for segment in segments:
                
                fuzzy_pattern = fuzzy_patterns[segment]
                ratio, match = text_util.fuzzy_search(body, segment, fuzzy_pattern)

                matches.append({
                    "match": match,
                    "ratio": ratio,
                })

                if ratio > best_ratio:
                    best_ratio = ratio

            gse_data = related_url2gse[rel_url]

            filtered_urls.append({
                "url": rel_url,
                "segments": list(segments),
                "body": body,
                "bestRatio": best_ratio,
                "foundMatches": matches,
                "highRatio": best_ratio > 0.5,
                "gseData": gse_data,
                "html": html,
            })

        with open(o_link_fp, "wb") as o_fl:
            json_dump(filtered_urls, o_fl)
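
The step keeps every candidate and only flags highRatio; a later consumer could presumably drop the weak matches with something like this (the 0.5 threshold is the one used above, the function name is illustrative):

def keep_related(filtered_urls, threshold=0.5):
    # Retain only candidates whose best fuzzy-match ratio clears the threshold.
    return [entry for entry in filtered_urls if entry["bestRatio"] > threshold]
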
Code example #12
File: rd.py  Project: fullbright/nlcd
class ReferenceIndex(object):
    """
    Index
    """
    def __init__(self, entries):

        self.id2entry = {}
        self.text_index = {}

        self.text_util = TextUtil()

        for entry in entries:
            self.id2entry[entry.ref_id] = entry

    def iterentries(self):
        return self.id2entry.itervalues()

    def index(self):

        for entry in self.id2entry.itervalues():

            sentences = self.text_util.sent_tokenize(
                entry.text) + [entry.title]
            quotes = self.text_util.extract_quoted(entry.text)
            sentences = self.text_util.select_segments(sentences,
                                                       quotes,
                                                       min_size=5)
            sentences = sanitize_sentences(sentences)

            for sent in sentences:
                if sent in self.text_index:
                    self.text_index[sent].add(entry.ref_id)
                else:
                    self.text_index[sent] = {entry.ref_id}

    def find_text_references(self, entry):

        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences,
                                                   quotes,
                                                   min_size=5)
        sentences = sanitize_sentences(sentences)

        sent_refs = {}

        for sent in sentences:

            found = self.text_index.get(sent)

            # Skip sentences that never made it into the index.
            if not found:
                continue

            for ref_id in found:
                if ref_id == entry.ref_id:
                    continue
                if sent in sent_refs:
                    sent_refs[sent].add(ref_id)
                else:
                    sent_refs[sent] = {ref_id}

        if len(sent_refs) == 0:
            return []

        uniq_refs = set()
        for ref_id_set in sent_refs.itervalues():
            uniq_refs.update(ref_id_set)

        ref_pairs = [(ref_id, entry.ref_id) for ref_id in uniq_refs]

        return ref_pairs

    def extract_query_sentence(self, entry, trim=32):
        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences,
                                                   quotes,
                                                   min_size=5)
        sentences = [self.text_util.simplified_text(s) for s in sentences]
        if trim is not None and trim > 0:
            for i in xrange(len(sentences)):
                sentences[i] = " ".join(sentences[i].split()[:trim])
        return sentences

    def extract_references(self, entries):
        found_links = set()
        for entry in entries:
            entry_refs = self.find_text_references(entry)
            found_links.update(entry_refs)
        return found_links

    def fuzzy_extract_references(self, entries):

        entries = list(entries)
        sentence2entry = {}

        # Extract sentences
        logging.info("Extracting sentences.")
        for entry in entries:

            q_sentences = self.extract_query_sentence(entry, trim=32)

            for sent in q_sentences:

                if sent not in sentence2entry:
                    sentence2entry[sent] = {entry.ref_id}
                else:
                    sentence2entry[sent].add(entry.ref_id)

        # Compile regexes
        logging.info("Compiling regexes.")
        sentence2regex = {}
        for sent in sentence2entry.iterkeys():

            if sent not in sentence2regex:
                regex = self.text_util.compile_fuzzy_pattern(sent)
                sentence2regex[sent] = regex
        logging.info("Compiled %d." % len(sentence2regex))

        # Find text matches
        logging.info("Fuzzy matching.")

        entry2matches = {}

        for i, entry in enumerate(entries):

            entry2matches[entry.ref_id] = set()

            for sent, regex in sentence2regex.iteritems():

                if self.text_util.ffs(entry.body, sent, regex):

                    sentence_entries = sentence2entry[sent]

                    for ref_id in sentence_entries:

                        if ref_id == entry.ref_id:
                            continue

                        entry2matches[entry.ref_id].add(ref_id)

            logging.info("Done %d/%d." % (len(entries), i + 1))

        found_pairs = []
        for entry, matches in entry2matches.iteritems():
            for m in matches:
                found_pairs.append((entry, m))

        return found_pairs

    def find_cross_references(self, sent_window_size=3):

        found_links = self.fuzzy_extract_references(self.iterentries())

        return found_links

    def print_titles(self):
        for entry in self.id2entry.itervalues():
            print entry.ref_id, "\t * %s" % entry.title

    def __repr__(self):

        return "<RefIndex(entries=%d)>" % len(self.id2entry)
Code example #13
File: google.py  Project: fullbright/nlcd
class CseAPI(object):
    BASE_URL = "https://www.googleapis.com/customsearch/v1?"

    def __init__(self, key, engine_id):
        if key is None:
            raise CseError("API key cannot be None")
        self.key = key
        self.engine_id = engine_id
        self.config = None
        self.text_util = TextUtil()

    @staticmethod
    def from_config(api_config):
        api = CseAPI(key=api_config["googleApiKey"],
                     engine_id=api_config["googleEngineId"])
        api.config = GoogleApiConfig(api_config)
        return api

    def make_query(
        self,
        query_string=None,
        country=None,
        date_start=None,
        date_end=None,
        exact_terms=None,
    ):
        q = {
            "q": query_string,

            "cr": country,                  # Country restrict(s)
            "dateRestrict": None,           # Specifies all search results are from a time period (string)
            "exactTerms": exact_terms,      # Identifies a phrase that all documents in the search results must
                                            # contain (string)
            "excludeTerms": None,           # Identifies a word or phrase that should not appear in any documents
                                            # in the search results (string)
            "fileType": None,               # Returns images of a specified type. Some of the allowed values are:
                                            # bmp, gif, png, jpg, svg, pdf, ... (string)
            "num": None,                    # Number of search results to return (integer)
            "orTerms": None,                # Provides additional search terms to check for in a document, where each
                                            # document in the search results must contain at least one of the additional
                                            # search terms (string)
            "relatedSite": None,            # Specifies that all search results should be pages that are related to the
                                            # specified URL (string)
            "sort": None,                   # The sort expression to apply to the results (string)
            "start": None,                  # The index of the first result to return (integer)
        }
        for key, value in q.items():
            if value is None:
                del q[key]
        return q

    def make_query_pages(self, results_number=10, start=1):
        start = 1
        page_size = 10.0
        pages = math.ceil(float(results_number) / page_size)
        for page in xrange(int(pages)):
            yield int(page * page_size + 1)

    def make_cse_url(self, query):
        params = urllib.urlencode(query)
        return self.BASE_URL + params

    def find_results(self,
                     query,
                     max_results=1000,
                     upper_threshold=1000,
                     bottom_threshold=100,
                     query_size_heuristic=10):

        result_urls = []
        found_results = []
        total_results = None

        query_size = self.text_util.words_count(query["exactTerms"])

        for start_index in self.make_query_pages(max_results):

            if total_results is not None and start_index > total_results:
                break

            query["start"] = start_index

            url = self.make_cse_url(query) + "&cx=%s&key=%s" % (self.engine_id,
                                                                self.key)

            logging.debug(url)

            result_urls.append(url)

            try:
                api_response = requests.get(url).json()
            except Exception:
                logging.error("Error while calling '%s':%s" %
                              (url, traceback.format_exc()))
                api_response = None

            if api_response is None:
                logging.warn("API response is None")
                break

            if "searchInformation" not in api_response:
                logging.warn("Search information not found")
                if total_results is None:
                    break

            if total_results is None:
                total_results = int(
                    api_response["searchInformation"]["totalResults"])

            if total_results > upper_threshold:
                logging.warn("Total results it too large %d" % total_results)
                break

            if total_results > bottom_threshold and query_size < query_size_heuristic:
                logging.warn("Too many results for too short query %d" %
                             total_results)
                break

            if "items" not in api_response or len(api_response["items"]) == 0:
                if total_results is None:
                    logging.warn("Items not found. Stop.")
                    break
                else:
                    logging.warn("Item not found. Skip.")
                    continue

            found_results.extend(api_response["items"])

        return found_results, result_urls, 0 if total_results is None else total_results

    def __repr__(self):
        return "<CseAPI(key='%s', engine='%s')>" % (self.key[:8] + "..",
                                                    self.engine_id[:8] + "..")
Code example #14
File: rd.py  Project: LucidAi/nlcd
class ReferenceIndex(object):

    """
    Index
    """

    def __init__(self, entries):

        self.id2entry = {}
        self.text_index = {}

        self.text_util = TextUtil()

        for entry in entries:
            self.id2entry[entry.ref_id] = entry

    def iterentries(self):
        return self.id2entry.itervalues()

    def index(self):

        for entry in self.id2entry.itervalues():

            sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
            quotes = self.text_util.extract_quoted(entry.text)
            sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
            sentences = sanitize_sentences(sentences)

            for sent in sentences:
                if sent in self.text_index:
                    self.text_index[sent].add(entry.ref_id)
                else:
                    self.text_index[sent] = {entry.ref_id}

    def find_text_references(self, entry):

        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
        sentences = sanitize_sentences(sentences)

        sent_refs = {}

        for sent in sentences:

            found = self.text_index.get(sent)

            # Skip sentences that never made it into the index.
            if not found:
                continue

            for ref_id in found:
                if ref_id == entry.ref_id:
                    continue
                if sent in sent_refs:
                    sent_refs[sent].add(ref_id)
                else:
                    sent_refs[sent] = {ref_id}

        if len(sent_refs) == 0:
            return []

        uniq_refs = set()
        for ref_id_set in sent_refs.itervalues():
            uniq_refs.update(ref_id_set)

        ref_pairs = [(ref_id, entry.ref_id) for ref_id in uniq_refs]

        return ref_pairs


    def extract_query_sentence(self, entry, trim=32):
        sentences = self.text_util.sent_tokenize(entry.text) + [entry.title]
        quotes = self.text_util.extract_quoted(entry.text)
        sentences = self.text_util.select_segments(sentences, quotes, min_size=5)
        sentences = [self.text_util.simplified_text(s) for s in sentences]
        if trim is not None and trim > 0:
            for i in xrange(len(sentences)):
                sentences[i] = " ".join(sentences[i].split()[:trim])
        return sentences

    def extract_references(self, entries):
        found_links = set()
        for entry in entries:
            entry_refs = self.find_text_references(entry)
            found_links.update(entry_refs)
        return found_links

    def fuzzy_extract_references(self, entries):

        entries = list(entries)
        sentence2entry = {}

        # Extract sentences
        logging.info("Extracting sentences.")
        for entry in entries:

            q_sentences = self.extract_query_sentence(entry, trim=32)

            for sent in q_sentences:

                if sent not in sentence2entry:
                    sentence2entry[sent] = {entry.ref_id}
                else:
                    sentence2entry[sent].add(entry.ref_id)

        # Compile regexes
        logging.info("Compiling regexes.")
        sentence2regex = {}
        for sent in sentence2entry.iterkeys():

            if sent not in sentence2regex:
                regex = self.text_util.compile_fuzzy_pattern(sent)
                sentence2regex[sent] = regex
        logging.info("Compiled %d." % len(sentence2regex))

        # Find text matches
        logging.info("Fuzzy matching.")

        entry2matches = {}

        for i, entry in enumerate(entries):

            entry2matches[entry.ref_id] = set()

            for sent, regex in sentence2regex.iteritems():

                if self.text_util.ffs(entry.body, sent, regex):

                    sentence_entries = sentence2entry[sent]

                    for ref_id in sentence_entries:

                        if ref_id == entry.ref_id:
                            continue

                        entry2matches[entry.ref_id].add(ref_id)

            logging.info("Done %d/%d." % (len(entries), i + 1))

        found_pairs = []
        for entry, matches in entry2matches.iteritems():
            for m in matches:
                found_pairs.append((entry, m))

        return found_pairs

    def find_cross_references(self, sent_window_size=3):


        found_links = self.fuzzy_extract_references(self.iterentries())

        return found_links

    def print_titles(self):
        for entry in self.id2entry.itervalues():
            print entry.ref_id, "\t * %s" % entry.title

    def __repr__(self):

        return "<RefIndex(entries=%d)>" % len(self.id2entry)