Ejemplo n.º 1
0
    def _handle_tsnecoordinates_view(self, view, whoosh_query):
        """
        Collects the 2D t-SNE coordinates of the hits of a query.

        The JSON-encoded result is stored in self.cache under a key derived
        from the view and the query; when an entry for that key exists it is
        decoded and returned without searching.

        Returns a dict {'coordinates': [...]} holding one entry per hit id:
        {'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}.
        Hits without a '2DtSNECoordinates' field are ignored.
        """
        # sort_keys makes equal views serialise identically, so they always map
        # to the same cache key regardless of the order their keys are stored in.
        cache_key = hashlib.md5(json.dumps(view, sort_keys=True) + repr(whoosh_query)).hexdigest()
        cached_raw = self.cache.get(cache_key)
        if cached_raw is not None:
            return json.loads(cached_raw)

        coordinates = {}
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh_query, limit=None)
            # Arguments are passed separately so repr(hits) is only computed
            # when debug logging is actually enabled.
            logger.debug("%s whoosh search results: %r", self.tracking_code, hits)
            for hit in hits:
                if '2DtSNECoordinates' not in hit:
                    continue
                refpoints = whooshutils.split_keywords(hit['2DtSNECoordinates'])
                hit_id = hit['id']
                sentence = hit['sentence']
                for refpoint in refpoints:
                    if not refpoint:
                        continue
                    coordinate_splits = whooshutils.split_keywords(refpoint)
                    # A later non-empty entry of the same hit id replaces an earlier one.
                    coordinates[hit_id] = {'x': coordinate_splits[0], 'y': coordinate_splits[1], 'text': sentence}

        result = {
            'coordinates': [{'id': i, 'coordinates': {'x': p['x'], 'y': p['y']}, 'text': p['text']} for i, p in coordinates.iteritems()]
        }

        self.cache.set(cache_key, json.dumps(result))
        return result
Ejemplo n.º 2
0
 def _handle_tsnecoordinates_view(self, view, whoosh_query):
     """
     Gathers the 2D t-SNE coordinates of the hits of a query.

     Returns a dict {'coordinates': [...]} holding one entry per hit id:
     {'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}.
     Hits without a '2DtSNECoordinates' field are ignored.
     """
     points_by_id = {}
     with self.whoosh_index.searcher() as searcher:
         hits = searcher.search(whoosh_query, limit=None)
         print >> sys.stderr, "whoosh search results: %s" % repr(hits)
         for hit in hits:
             if '2DtSNECoordinates' not in hit:
                 continue
             raw_points = whooshutils.split_keywords(hit['2DtSNECoordinates'])
             hit_id = hit['id']
             text = hit['sentence']
             for raw_point in raw_points:
                 if not raw_point:
                     continue
                 parts = whooshutils.split_keywords(raw_point)
                 # The last non-empty entry of a hit id is the one that is kept.
                 points_by_id[hit_id] = {'x': parts[0], 'y': parts[1], 'text': text}

     entries = []
     for hit_id, point in points_by_id.iteritems():
         entries.append({
             'id': hit_id,
             'coordinates': {'x': point['x'], 'y': point['y']},
             'text': point['text']
         })
     return {'coordinates': entries}
Ejemplo n.º 3
0
    def _handle_plottimeline_view(self, view, whoosh_query):
        """
        Builds the plot timeline for the entities named in a view.

        Keys read from view: 'clusterField', 'entities' (field name -> entity
        values) and, optionally, 'cooccurrences' together with
        'cooccurrenceFields'. If 'cooccurrences' is present, the most frequent
        co-occurring entities (at most
        self.plottimeline_max_cooccurring_entities of them) are added to the
        requested entities; the value 'or' makes the co-occurrence query
        disjunctive, any other value conjunctive.

        Returns a dict with 'timeline', a nested mapping
        field -> entity value -> year -> list of cluster field values, and, if
        co-occurrences were requested, 'numCooccurringEntities' and
        'numIncludedCooccurringEntities'.
        """
        def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
            # Counts, over the hits that have need_field, the values of the
            # cooc_fields that are not already among the given entities.
            # Returns (top ((field, value), count) items, total number of items).
            op = whoosh.query.Or if is_disjunctive else whoosh.query.And
            rel_query = op([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
            # Sets keep the per-value membership test in the hit loop cheap.
            known_values = dict((ef, set(evs)) for ef, evs in entities.iteritems())
            cooc_counts = {}
            with self.whoosh_index.searcher() as searcher:
                hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
                for hit in hits:
                    if need_field not in hit:
                        continue
                    for cooc_field in cooc_fields:
                        # A hit need not have every co-occurrence field; reading
                        # a missing one would raise KeyError.
                        if cooc_field not in hit:
                            continue
                        known = known_values.get(cooc_field, ())
                        for value in whooshutils.split_keywords(hit[cooc_field]):
                            if value not in known:
                                key = (cooc_field, value)
                                cooc_counts[key] = cooc_counts.get(key, 0) + 1

            ranked = sorted(cooc_counts.iteritems(), key=lambda item: item[1], reverse=True)
            return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)

        result = {}

        cluster_field = view['clusterField']
        entities = view['entities']
        if 'cooccurrences' in view:
            is_disjunctive = view['cooccurrences'] == 'or'
            cooc_entities, num_total_coocs = find_cooccurrences(entities, set(view['cooccurrenceFields']), cluster_field, is_disjunctive)
            # Work on a copy so the entities stored in the view are not modified.
            entities = dict((ef, set(evs)) for ef, evs in entities.iteritems())
            for (entity_field, entity_value), _count in cooc_entities:
                entities.setdefault(entity_field, set()).add(entity_value)
            result['numCooccurringEntities'] = num_total_coocs
            result['numIncludedCooccurringEntities'] = len(cooc_entities)

        # Checking for cluster_field per hit below seems to be slightly faster (empirically) than including Every(cluster_field) in the query
        rel_query = whoosh.query.Or([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
        timeline = dict((ef, dict((ev, {}) for ev in evs)) for ef, evs in entities.iteritems())
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
            for hit in hits:
                if cluster_field not in hit:
                    continue
                year = int(hit['year'])
                cluster_values = set(whooshutils.split_keywords(hit[cluster_field]))
                for entity_field, entity_values in entities.iteritems():
                    # The entity terms are OR-ed, so a hit may match through one
                    # field and lack another; skip the fields it does not have.
                    if entity_field not in hit:
                        continue
                    hit_entity_values = set(whooshutils.split_keywords(hit[entity_field]))
                    field_timeline = timeline[entity_field]
                    for entity_value in entity_values:
                        if entity_value in hit_entity_values:
                            field_timeline[entity_value].setdefault(year, set()).update(cluster_values)

        # Replace the per-year sets by lists.
        for entity_field, entity_values in entities.iteritems():
            field_timeline = timeline[entity_field]
            for entity_value in entity_values:
                field_timeline[entity_value] = dict((y, list(cvs)) for y, cvs in field_timeline[entity_value].iteritems())

        result['timeline'] = timeline
        return result
Ejemplo n.º 4
0
        def find_cooccurrences(entities, cooc_fields, need_field,
                               is_disjunctive):
            """
            Counts the values that co-occur with the given entities.

            Searches for hits matching whoosh_query and the entity terms (any
            of them if is_disjunctive, otherwise all of them). For hits that
            have need_field, every value of the cooc_fields that is not one of
            the given entities of that field is counted.

            self and whoosh_query are free variables here; they must be
            supplied by the enclosing scope.

            Returns a pair: the ((field, value), count) items sorted by
            decreasing count and cut to
            self.plottimeline_max_cooccurring_entities, and the total number
            of distinct co-occurring (field, value) pairs.
            """
            op = whoosh.query.Or if is_disjunctive else whoosh.query.And
            rel_query = op([
                whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems()
                for ev in evs
            ])
            # Sets keep the per-value membership test in the hit loop cheap.
            known_values = dict(
                (ef, set(evs)) for ef, evs in entities.iteritems())
            cooc_counts = {}
            with self.whoosh_index.searcher() as searcher:
                hits = searcher.search(
                    whoosh.query.And([whoosh_query, rel_query]), limit=None)
                for hit in hits:
                    if need_field not in hit:
                        continue
                    for cooc_field in cooc_fields:
                        # A hit need not have every co-occurrence field;
                        # reading a missing one would raise KeyError.
                        if cooc_field not in hit:
                            continue
                        known = known_values.get(cooc_field, ())
                        for value in whooshutils.split_keywords(
                                hit[cooc_field]):
                            if value not in known:
                                key = (cooc_field, value)
                                cooc_counts[key] = cooc_counts.get(key, 0) + 1

            ranked = sorted(cooc_counts.iteritems(),
                            key=lambda item: item[1],
                            reverse=True)
            limit = self.plottimeline_max_cooccurring_entities
            return ranked[:limit], len(ranked)
Ejemplo n.º 5
0
    def generate_field_counts(self, response, views, whoosh_query):
        """
        Fills response with per-value hit counts for every field view.

        For each view id, response[view_id]['counts'] ends up as a list of
        (value, count) pairs ordered by decreasing count. Every value of a
        multiple-valued field is counted, at most once per hit. A field name
        is replaced by its alias when backend_domain_config provides one.
        """
        field_names = ' '.join(v['field'] for v in views.itervalues())
        print >> sys.stderr, "generating field counts for fields: %s" % field_names

        for view_id in views.iterkeys():
            response[view_id] = {'counts': {}}

        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh_query, limit=None)
            print >> sys.stderr, "whoosh search results: %s" % repr(hits)
            for hit in hits:
                for view_id, view in views.iteritems():
                    requested = view['field']
                    field = backend_domain_config.field_name_aliases(requested) or requested
                    if field not in hit:
                        continue
                    # The set collapses values repeated within one hit.
                    distinct_values = set(whooshutils.split_keywords(hit[field]))
                    tally = response[view_id]['counts']
                    for value in distinct_values:
                        tally[value] = tally.get(value, 0) + 1

        # Turn each value -> count mapping into a list sorted by count.
        for view_id in views.iterkeys():
            ranked = sorted(response[view_id]['counts'].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            response[view_id]['counts'] = ranked
Ejemplo n.º 6
0
    def generate_field_counts(self, response, views, whoosh_query):
        """
        Fills response with per-value hit counts for every field view.

        For each view id, response[view_id]['counts'] ends up as a list of
        (value, count) pairs ordered by decreasing count. Every value of a
        multiple-valued field is counted, at most once per hit. A field name
        is replaced by its alias when domain_config provides one.
        """
        field_names = ' '.join(v['field'] for v in views.itervalues())
        logger.debug(self.tracking_code + " generating field counts for fields: %s" % field_names)

        for view_id in views.iterkeys():
            response[view_id] = {'counts': {}}

        logger.debug(self.tracking_code + " whoosh_query: " + repr(whoosh_query))
        logger.debug(self.tracking_code + " view: " + json.dumps(views))

        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh_query, limit=None)
            logger.info(self.tracking_code + " whoosh search results: %s" % repr(hits))
            for hit in hits:
                for view_id, view in views.iteritems():
                    requested = view['field']
                    field = domain_config.field_name_aliases(requested) or requested
                    if field not in hit:
                        continue
                    # The set collapses values repeated within one hit.
                    distinct_values = set(whooshutils.split_keywords(hit[field]))
                    tally = response[view_id]['counts']
                    for value in distinct_values:
                        tally[value] = tally.get(value, 0) + 1

        # Turn each value -> count mapping into a list sorted by count.
        for view_id in views.iterkeys():
            ranked = sorted(response[view_id]['counts'].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            response[view_id]['counts'] = ranked
Ejemplo n.º 7
0
  def generate_field_counts(self, response, views, whoosh_query):
    """
    Fills response with per-value hit counts for every field view.

    For each view id, response[view_id]['counts'] ends up as a list of
    (value, count) pairs ordered by decreasing count. Every value of a
    multiple-valued field is counted, at most once per hit.
    """

    for view_id in views.iterkeys():
      response[view_id] = {'counts': {}}

    with self.whoosh_index.searcher() as searcher:
      hits = searcher.search(whoosh_query, limit=None)
      print >> sys.stderr, "whoosh search results: %s" % repr(hits)
      for hit in hits:
        for view_id, view in views.iteritems():
          field = view['field']
          if field not in hit:
            continue
          # The set collapses values repeated within one hit.
          distinct_values = set(whooshutils.split_keywords(hit[field]))
          tally = response[view_id]['counts']
          for value in distinct_values:
            tally[value] = tally.get(value, 0) + 1

    # Turn each value -> count mapping into a list sorted by count.
    for view_id in views.iterkeys():
      ranked = sorted(response[view_id]['counts'].items(), key=lambda pair: pair[1], reverse=True)
      response[view_id]['counts'] = ranked
Ejemplo n.º 8
0
 def _handle_tsnecoordinates_view(self, view, whoosh_query):
   """
   Gathers the 2D t-SNE coordinates of the hits of a query.

   Returns a dict {'coordinates': [...]} holding one entry per hit id:
   {'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}.
   Hits without a '2DtSNECoordinates' field are ignored.
   """
   by_hit_id = {}
   with self.whoosh_index.searcher() as searcher:
     hits = searcher.search(whoosh_query, limit=None)
     print >> sys.stderr, "whoosh search results: %s" % repr(hits)
     for hit in hits:
       if '2DtSNECoordinates' not in hit:
         continue
       raw_points = whooshutils.split_keywords(hit['2DtSNECoordinates'])
       hit_id = hit['id']
       text = hit['sentence']
       for raw_point in raw_points:
         if not raw_point:
           continue
         parts = whooshutils.split_keywords(raw_point)
         # The last non-empty entry of a hit id is the one that is kept.
         by_hit_id[hit_id] = {'x': parts[0], 'y': parts[1], 'text': text}

   entries = []
   for hit_id, point in by_hit_id.iteritems():
     position = {'x': point['x'], 'y': point['y']}
     entries.append({'id': hit_id, 'coordinates': position, 'text': point['text']})
   return {'coordinates': entries}
Ejemplo n.º 9
0
    def _handle_tsnecoordinates_view(self, view, whoosh_query):
        """
        Collects the 2D t-SNE coordinates of the hits of a query.

        The JSON-encoded result is stored in self.cache under a key derived
        from the view and the query; when an entry for that key exists it is
        decoded and returned without searching.

        Returns a dict {'coordinates': [...]} holding one entry per hit id:
        {'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}.
        Hits without a '2DtSNECoordinates' field are ignored.
        """
        # sort_keys makes equal views serialise identically, so they always
        # map to the same cache key regardless of the order their keys are
        # stored in.
        cache_key = hashlib.md5(
            json.dumps(view, sort_keys=True) +
            repr(whoosh_query)).hexdigest()
        cached_raw = self.cache.get(cache_key)
        if cached_raw is not None:
            return json.loads(cached_raw)

        coordinates = {}
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh_query, limit=None)
            # Arguments are passed separately so repr(hits) is only computed
            # when debug logging is actually enabled.
            logger.debug("%s whoosh search results: %r",
                         self.tracking_code, hits)
            for hit in hits:
                if '2DtSNECoordinates' not in hit:
                    continue
                refpoints = whooshutils.split_keywords(
                    hit['2DtSNECoordinates'])
                hit_id = hit['id']
                sentence = hit['sentence']
                for refpoint in refpoints:
                    if not refpoint:
                        continue
                    coordinate_splits = whooshutils.split_keywords(refpoint)
                    # A later non-empty entry of the same hit id replaces an
                    # earlier one.
                    coordinates[hit_id] = {
                        'x': coordinate_splits[0],
                        'y': coordinate_splits[1],
                        'text': sentence
                    }

        result = {
            'coordinates': [{
                'id': i,
                'coordinates': {
                    'x': p['x'],
                    'y': p['y']
                },
                'text': p['text']
            } for i, p in coordinates.iteritems()]
        }

        self.cache.set(cache_key, json.dumps(result))
        return result
Ejemplo n.º 10
0
        def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
            """
            Counts the values that co-occur with the given entities.

            Searches for hits matching whoosh_query and the entity terms (any
            of them if is_disjunctive, otherwise all of them). For hits that
            have need_field, every value of the cooc_fields that is not one of
            the given entities of that field is counted.

            self and whoosh_query are free variables here; they must be
            supplied by the enclosing scope.

            Returns a pair: the ((field, value), count) items sorted by
            decreasing count and cut to
            self.plottimeline_max_cooccurring_entities, and the total number
            of distinct co-occurring (field, value) pairs.
            """
            op = whoosh.query.Or if is_disjunctive else whoosh.query.And
            rel_query = op([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
            # Sets keep the per-value membership test in the hit loop cheap.
            known_values = dict((ef, set(evs)) for ef, evs in entities.iteritems())
            cooc_counts = {}
            with self.whoosh_index.searcher() as searcher:
                hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
                for hit in hits:
                    if need_field not in hit:
                        continue
                    for cooc_field in cooc_fields:
                        # A hit need not have every co-occurrence field; reading
                        # a missing one would raise KeyError.
                        if cooc_field not in hit:
                            continue
                        known = known_values.get(cooc_field, ())
                        for value in whooshutils.split_keywords(hit[cooc_field]):
                            if value not in known:
                                key = (cooc_field, value)
                                cooc_counts[key] = cooc_counts.get(key, 0) + 1

            ranked = sorted(cooc_counts.iteritems(), key=lambda item: item[1], reverse=True)
            return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)
Ejemplo n.º 11
0
 def _handle_referencepointlinks_view(self, view, whoosh_query):
     """
     Counts how often two distinct reference points appear in the same hit.

     Each pair is stored as a tuple ordered so that the smaller reference
     point comes first. Pairs are formed from positions in the hit's list
     of reference points, so a reference point listed more than once in a
     hit contributes more than once. Hits without a 'referencePoints' field
     contribute nothing.

     Returns a dict {'links': [{'refpoints': (a, b), 'count': n}, ...]}.
     """
     link_counts = {}
     with self.whoosh_index.searcher() as searcher:
         hits = searcher.search(whoosh_query, limit=None)
         # Arguments are passed separately so repr(hits) is only computed
         # when debug logging is actually enabled.
         logger.debug("%s whoosh search results: %r", self.tracking_code, hits)
         for hit in hits:
             # Reading a field the hit does not have would raise KeyError.
             if 'referencePoints' not in hit:
                 continue
             refpoints = whooshutils.split_keywords(hit['referencePoints'])
             for i, refpoint1 in enumerate(refpoints):
                 for refpoint2 in refpoints[i + 1:]:
                     if refpoint1 == refpoint2:
                         continue
                     # Use lexicographic order to guarantee unique choices of two distinct reference points
                     pair = (refpoint1, refpoint2) if refpoint1 < refpoint2 else (refpoint2, refpoint1)
                     link_counts[pair] = link_counts.get(pair, 0) + 1
     return {
         'links': [{'refpoints': p, 'count': c} for (p, c) in link_counts.iteritems()]
     }
Ejemplo n.º 12
0
 def _handle_referencepointlinks_view(self, view, whoosh_query):
     """
     Counts how often two distinct reference points appear in the same hit.

     Each pair is stored as a tuple ordered so that the smaller reference
     point comes first. Pairs are formed from positions in the hit's list
     of reference points, so a reference point listed more than once in a
     hit contributes more than once. Hits without a 'referencePoints' field
     contribute nothing.

     Returns a dict {'links': [{'refpoints': (a, b), 'count': n}, ...]}.
     """
     link_counts = {}
     with self.whoosh_index.searcher() as searcher:
         hits = searcher.search(whoosh_query, limit=None)
         print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
         for hit in hits:
             # Reading a field the hit does not have would raise KeyError.
             if 'referencePoints' not in hit:
                 continue
             refpoints = whooshutils.split_keywords(hit['referencePoints'])
             for i, refpoint1 in enumerate(refpoints):
                 for refpoint2 in refpoints[i + 1:]:
                     if refpoint1 == refpoint2:
                         continue
                     # Use lexicographic order to guarantee unique choices of two distinct reference points
                     if refpoint1 < refpoint2:
                         pair = (refpoint1, refpoint2)
                     else:
                         pair = (refpoint2, refpoint1)
                     link_counts[pair] = link_counts.get(pair, 0) + 1
     return {
         'links': [{
             'refpoints': p,
             'count': c
         } for (p, c) in link_counts.iteritems()]
     }
Ejemplo n.º 13
0
    def _handle_plottimeline_view(self, view, whoosh_query):
        """
        Builds the plot timeline for the entities named in a view.

        Keys read from view: 'clusterField', 'entities' (field name -> entity
        values) and, optionally, 'cooccurrences' ('and' or 'or'; any other
        value raises KeyError) together with 'cooccurrenceFields'. If
        'cooccurrences' is present, the most frequent co-occurring entities
        (at most self.plottimeline_max_cooccurring_entities of them) are
        added to the requested entities.

        Returns a dict with 'timeline', a nested mapping
        field -> entity value -> year -> list of cluster field values, and, if
        co-occurrences were requested, 'numCooccurringEntities' and
        'numIncludedCooccurringEntities'.
        """
        def find_cooccurrences(entities, cooc_fields, need_field,
                               is_disjunctive):
            # Counts, over the hits that have need_field, the values of the
            # cooc_fields that are not already among the given entities.
            # Returns (top ((field, value), count) items, total item count).
            op = whoosh.query.Or if is_disjunctive else whoosh.query.And
            rel_query = op([
                whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems()
                for ev in evs
            ])
            # Sets keep the per-value membership test in the hit loop cheap.
            known_values = dict(
                (ef, set(evs)) for ef, evs in entities.iteritems())
            cooc_counts = {}
            with self.whoosh_index.searcher() as searcher:
                hits = searcher.search(
                    whoosh.query.And([whoosh_query, rel_query]), limit=None)
                for hit in hits:
                    if need_field not in hit:
                        continue
                    for cooc_field in cooc_fields:
                        # A hit need not have every co-occurrence field;
                        # reading a missing one would raise KeyError.
                        if cooc_field not in hit:
                            continue
                        known = known_values.get(cooc_field, ())
                        for value in whooshutils.split_keywords(
                                hit[cooc_field]):
                            if value not in known:
                                key = (cooc_field, value)
                                cooc_counts[key] = cooc_counts.get(key, 0) + 1

            ranked = sorted(cooc_counts.iteritems(),
                            key=lambda item: item[1],
                            reverse=True)
            limit = self.plottimeline_max_cooccurring_entities
            return ranked[:limit], len(ranked)

        result = {}

        cluster_field = view['clusterField']
        entities = view['entities']
        if 'cooccurrences' in view:
            # The lookup rejects anything other than 'and' / 'or'.
            is_disjunctive = {'and': False, 'or': True}[view['cooccurrences']]
            cooc_entities, num_total_coocs = find_cooccurrences(
                entities, set(view['cooccurrenceFields']), cluster_field,
                is_disjunctive)
            # Work on a copy so the entities stored in the view are not
            # modified.
            entities = dict((ef, set(evs)) for ef, evs in entities.iteritems())
            for (entity_field, entity_value), _count in cooc_entities:
                entities.setdefault(entity_field, set()).add(entity_value)
            result['numCooccurringEntities'] = num_total_coocs
            result['numIncludedCooccurringEntities'] = len(cooc_entities)

        # Checking for cluster_field per hit below seems to be slightly faster (empirically) than including Every(cluster_field) in the query
        rel_query = whoosh.query.Or([
            whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems()
            for ev in evs
        ])
        timeline = dict((ef, dict((ev, {}) for ev in evs))
                        for ef, evs in entities.iteritems())
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]),
                                   limit=None)
            for hit in hits:
                if cluster_field not in hit:
                    continue
                year = int(hit['year'])
                cluster_values = set(
                    whooshutils.split_keywords(hit[cluster_field]))
                for entity_field, entity_values in entities.iteritems():
                    # The entity terms are OR-ed, so a hit may match through
                    # one field and lack another; skip the fields it does
                    # not have.
                    if entity_field not in hit:
                        continue
                    hit_entity_values = set(
                        whooshutils.split_keywords(hit[entity_field]))
                    field_timeline = timeline[entity_field]
                    for entity_value in entity_values:
                        if entity_value in hit_entity_values:
                            years = field_timeline[entity_value]
                            years.setdefault(year, set()).update(
                                cluster_values)

        # Replace the per-year sets by lists.
        for entity_field, entity_values in entities.iteritems():
            field_timeline = timeline[entity_field]
            for entity_value in entity_values:
                field_timeline[entity_value] = dict(
                    (y, list(cvs))
                    for y, cvs in field_timeline[entity_value].iteritems())

        result['timeline'] = timeline
        return result