Example #1
    def __init__(self,
                 qrels_input_path,
                 qrels_output_path,
                 topics_input_path,
                 topics_output_path,
                 corpus_input_path,
                 corpus_output_path,
                 include_linked=False,
                 query_sample_size=None):
        self.qrels_input_path = qrels_input_path
        self.qrels_output_path = qrels_output_path
        self.topics_input_path = topics_input_path
        self.topics_output_path = topics_output_path
        self.corpus_input_path = corpus_input_path
        self.corpus_output_path = corpus_output_path
        self.include_linked = include_linked
        self.query_sample_size = query_sample_size

        if os.path.exists(self.corpus_output_path):
            raise ArmyAntException("%s already exists" %
                                   self.corpus_output_path)

        qrels_dir = os.path.dirname(self.qrels_output_path)
        if os.path.exists(qrels_dir):
            raise ArmyAntException("%s already exists" % qrels_dir)

        topics_dir = os.path.dirname(self.topics_output_path)
        if os.path.exists(topics_dir):
            raise ArmyAntException("%s already exists" % topics_dir)

        os.makedirs(self.corpus_output_path)
        os.makedirs(qrels_dir)
        os.makedirs(topics_dir)
Example #2
    def __init__(self, task, eval_location):
        super().__init__(task, eval_location)

        self.o_results_path = os.path.join(eval_location, 'results', task._id)
        self.o_assessments_path = os.path.join(eval_location, 'assessments', task._id)

        try:
            os.makedirs(self.o_results_path)
        except FileExistsError:
            raise ArmyAntException("Results directory '%s' already exists" % self.o_results_path)

        try:
            os.makedirs(self.o_assessments_path)
        except FileExistsError:
            raise ArmyAntException("Assessments directory '%s' already exists" % self.o_assessments_path)
Example #3
    async def index(self, features_location=None, pgonly=True):
        if os.path.exists(self.index_location):
            raise ArmyAntException("%s already exists" % self.index_location)

        os.mkdir(self.index_location)

        async for item in super().index(pgonly=pgonly):
            yield item

        conn = psycopg2.connect(
            "dbname='army_ant' user='******' host='localhost'")
        c = conn.cursor()

        logging.info("Creating term nodes CSV file")
        with open(os.path.join(self.index_location, 'term-nodes.csv'),
                  'w') as f:
            c.copy_expert(
                """COPY (SELECT node_id AS "node_id:ID", attributes->'name'->0->>'value' AS name, """
                """label AS ":LABEL" FROM nodes) TO STDOUT WITH CSV HEADER""",
                f)

        logging.info("Creating in_window_of edges CSV file")
        with open(os.path.join(self.index_location, 'in_window_of-edges.csv'),
                  'w') as f:
            c.copy_expert(
                """COPY (SELECT source_node_id AS ":START_ID", attributes->>'doc_id' AS doc_id, """
                """target_node_id AS ":END_ID", label AS ":TYPE" FROM edges) TO STDOUT WITH CSV HEADER""",
                f)
Example #4
    def __init__(self, db_location, db_name, eval_location):
        self.tasks = []
        self.running = None

        self.eval_location = eval_location
        self.results_dirname = os.path.join(eval_location, 'results')
        self.assessments_dirname = os.path.join(eval_location, 'assessments')
        self.spool_dirname = os.path.join(eval_location, 'spool')

        os.makedirs(self.results_dirname, exist_ok=True)
        os.makedirs(self.assessments_dirname, exist_ok=True)
        os.makedirs(self.spool_dirname, exist_ok=True)

        db_location_parts = db_location.split(':')

        if len(db_location_parts) > 1:
            db_location = db_location_parts[0]
            db_port = int(db_location_parts[1])
        else:
            db_port = 27017

        try:
            self.client = MongoClient(db_location, db_port)
        except ConnectionFailure:
            raise ArmyAntException(
                "Could not connect to MongoDB instance on %s:%s" %
                (db_location, db_port))

        self.db = self.client[db_name]

        self.db['evaluation_tasks'].create_index('run_id', unique=True)
Example #5
    def __init__(self, source_path):
        super(MongoDBReader, self).__init__(source_path)

        db_location_parts = re.split(r'[:/]', source_path)

        if len(db_location_parts) >= 3:
            db_host = db_location_parts[0]
            db_port = int(db_location_parts[1])
            db_name = db_location_parts[2]
        elif len(db_location_parts) == 2:
            db_host = db_location_parts[0]
            db_port = 27017
            db_name = db_location_parts[1]
        else:
            db_host = 'localhost'
            db_port = 27017
            db_name = db_location_parts[0]

        try:
            self.client = MongoClient(db_host, db_port)
        except ConnectionFailure:
            raise ArmyAntException(
                "Could not connect to MongoDB instance on %s:%s" %
                (db_host, db_port))

        self.db = self.client[db_name]
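
The accepted location formats are host:port/name, host/name, or a bare
database name. A minimal sketch of how the re.split call above decomposes
each of them, using made-up values:

    import re

    for source_path in ('example-host:27018/army_ant',
                        'example-host/army_ant',
                        'army_ant'):
        print(re.split(r'[:/]', source_path))
    # ['example-host', '27018', 'army_ant']  -> host, port, name (parts[2])
    # ['example-host', 'army_ant']           -> host, name; default port 27017
    # ['army_ant']                            -> name only; localhost:27017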
Example #6
    async def index(self, features_location=None):
        if not features_location:
            raise ArmyAntException(
                "Must provide a features location with topics.txt and qrels.txt files"
            )

        topics = self.load_topics(features_location)
        qrels = self.load_qrels(features_location)

        async for doc in self.lucene_engine.index(
                features_location=features_location):
            self.process_web_features(doc)
            yield doc

        ltr_helper = TensorFlowRanking.JLearningToRankHelper(
            self.lucene_index_location)
        ltr_helper.computeDocumentFeatures()
        j_graph_based_features = self.j_build_graph_based_features()
        ltr_helper.updateDocumentFeatures(j_graph_based_features)
        train_set = self.build_train_set(ltr_helper, topics, qrels)

        pd_train_generator = self.pandas_generator(train_set)

        logger.info("Training model")
        hparams = tf.contrib.training.HParams(learning_rate=0.05)
        ranker = self.get_estimator(hparams)
        ranker.train(input_fn=lambda: self.input_fn(pd_train_generator),
                     steps=100)
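
Note that tf.contrib.training.HParams and the estimator-style train() call are
TensorFlow 1.x APIs; tf.contrib was removed in TensorFlow 2, so this example
assumes a 1.x environment.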
Example #7
    async def search(self,
                     query,
                     offset,
                     limit,
                     query_type=None,
                     task=None,
                     base_index_location=None,
                     base_index_type=None,
                     ranking_function=None,
                     ranking_params=None,
                     debug=False):
        try:
            self.cluster = await Cluster.open(self.loop,
                                              hosts=[self.index_host],
                                              port=self.index_port)
        except ClientConnectorError:
            raise ArmyAntException(
                "Could not connect to Gremlin Server on %s:%s" %
                (self.index_host, self.index_port))

        self.client = await self.cluster.connect()

        query_tokens = GraphOfWord.analyze(query)

        result_set = await self.client.submit(
            ('g = %s.traversal()\n' % self.graph) +
            load_gremlin_script('graph_of_word_query'), {
                'queryTokens': query_tokens,
                'offset': offset,
                'limit': limit
            })
        results = await result_set.one()
        await self.cluster.close()

        return results
Example #8
    def get_topic_assessments(self):
        topic_doc_judgements = {}

        if not os.path.exists(self.task.assessments_path):
            raise ArmyAntException("Topic assessments file not found: %s" %
                                   self.task.assessments_path)

        with open(self.task.assessments_path, 'r') as f:
            for line in f:
                if self.retrieval_task == Index.RetrievalTask.entity_retrieval:
                    topic_id, _, id, _, judgement = line.split(' ', 4)
                    judgement = int(judgement)
                    if judgement == 2:
                        judgement = 0
                else:
                    topic_id, _, id, judgement, _ = line.split(' ', 4)
                    judgement = int(judgement)
                    if judgement > 0:
                        judgement = 1

                if topic_id not in topic_doc_judgements:
                    topic_doc_judgements[topic_id] = {}
                topic_doc_judgements[topic_id][id] = judgement

        return topic_doc_judgements
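
For reference, a sketch of the whitespace-separated assessment lines the two
branches above assume; the field values here are hypothetical:

    # Entity retrieval: judgement is the fifth field (a 2 is remapped to 0).
    line = '2009011 Q0 <dbpedia:Example> 0 2'
    topic_id, _, doc_id, _, judgement = line.split(' ', 4)

    # Document retrieval: judgement is the fourth field (any positive value becomes 1).
    line = '301 0 WAPO-0001 1 trailing'
    topic_id, _, doc_id, judgement, _ = line.split(' ', 4)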
Example #9
    def factory(source_path,
                source_reader,
                features_location=None,
                limit=None):
        import army_ant.reader as rd

        if source_reader == 'wikipedia_data':
            return rd.WikipediaDataReader(source_path)
        elif source_reader == 'inex':
            return rd.INEXReader(source_path,
                                 include_dbpedia=False,
                                 limit=limit)
        elif source_reader == 'inex_dbpedia':
            return rd.INEXReader(source_path,
                                 include_dbpedia=True,
                                 limit=limit)
        elif source_reader == 'inex_dir':
            return rd.INEXDirectoryReader(source_path,
                                          include_dbpedia=False,
                                          limit=limit)
        elif source_reader == 'inex_dir_dbpedia':
            return rd.INEXDirectoryReader(source_path,
                                          include_dbpedia=True,
                                          limit=limit)
        elif source_reader == 'living_labs':
            return rd.LivingLabsReader(source_path, limit)
        elif source_reader == 'wapo':
            return rd.TRECWashingtonPostReader(source_path, limit=limit)
        elif source_reader == 'wapo_doc_profile':
            return rd.TRECWashingtonPostReader(
                source_path,
                features_location=features_location,
                include_ae_doc_profile=True,
                limit=limit)
        elif source_reader == 'wapo_dbpedia':
            return rd.TRECWashingtonPostReader(source_path,
                                               include_dbpedia=True,
                                               limit=limit)
        elif source_reader == 'wapo_doc_profile_dbpedia':
            return rd.TRECWashingtonPostReader(
                source_path,
                features_location=features_location,
                include_ae_doc_profile=True,
                include_dbpedia=True,
                limit=limit)
        elif source_reader == 'csv':
            return rd.CSVReader(source_path)
        # elif source_reader == 'gremlin':
        #     return rd.GremlinReader(source_path)
        else:
            raise ArmyAntException("Unsupported source reader %s" %
                                   source_reader)
Example #10
    async def search(self,
                     query,
                     offset,
                     limit,
                     query_type=None,
                     task=None,
                     base_index_location=None,
                     base_index_type=None,
                     ranking_function=None,
                     ranking_params=None,
                     debug=False):
        raise ArmyAntException("Search not implemented for %s" %
                               self.__class__.__name__)
Example #11
    def queue(self):
        duplicate_error = False
        run_id_error = False

        inserted_ids = []
        for task in self.tasks:
            if task.run_id is None or task.run_id.strip() == '':
                run_id_error = True
                continue
            try:
                task.time = int(round(time.time() * 1000))
                result = self.db['evaluation_tasks'].insert_one(task.__dict__)
                inserted_ids.append(result.inserted_id)
            except DuplicateKeyError:
                duplicate_error = True

        if duplicate_error:
            raise ArmyAntException("The Run ID must be unique.")

        if run_id_error:
            raise ArmyAntException("Tasks without a Run ID are not accepted")

        return inserted_ids
Example #12
    async def index(self, features_location=None):
        if not features_location:
            raise ArmyAntException(
                "Must provide a features location with topics.txt and qrels.txt files"
            )

        async for doc in self.lucene_engine.index(
                features_location=features_location):
            yield doc

        features_helper = LuceneFeaturesEngine.JFeaturesHelper(
            self.lucene_index_location)
        j_features = self.j_load_features(features_location)
        features_helper.setDocumentFeatures(j_features)
Example #13
def kendall_w(pd_dfs):
    pd_dfs = fill_missing(pd_dfs,
                          'id',
                          rank=FillMethod.INC_MAX,
                          score=FillMethod.ZERO)

    rankings = np.stack([df.sort_values('id')['rank'] for df in pd_dfs],
                        axis=0)

    if rankings.ndim != 2:
        raise ArmyAntException('Rankings matrix must be 2-dimensional')

    m = rankings.shape[0]  # rankers
    n = rankings.shape[1]  # documents

    return (12 * n * np.var(np.sum(rankings, axis=0))) / (m**2 * (n**3 - n))
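
As a sanity check on the formula, rankers that agree perfectly should give
W = 1; a small self-contained sketch (bypassing fill_missing, which only
matters when ids are missing):

    import numpy as np

    # Two rankers assign identical ranks to three documents.
    rankings = np.array([[1, 2, 3],
                         [1, 2, 3]])
    m, n = rankings.shape
    w = (12 * n * np.var(np.sum(rankings, axis=0))) / (m**2 * (n**3 - n))
    assert np.isclose(w, 1.0)  # perfect concordance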
Example #14
    def get_topic_assessments(self):
        topic_doc_judgements = {}

        if not os.path.exists(self.task.assessments_path):
            raise ArmyAntException("Topic assessments file not found: %s" %
                                   self.task.assessments_path)

        with open(self.task.assessments_path, 'r') as f:
            for line in f:
                topic_id, _, id, judgement = line.split(' ')

                if topic_id not in topic_doc_judgements:
                    topic_doc_judgements[topic_id] = {}
                topic_doc_judgements[topic_id][id] = int(judgement)

        return topic_doc_judgements
Example #15
    def __init__(self,
                 source_path,
                 doc_id_suffix=':doc_id',
                 text_suffix=':text'):
        super(CSVReader, self).__init__(source_path)

        self.reader = csv.DictReader(open(source_path, newline=''))
        self.doc_id_suffix = doc_id_suffix
        self.text_suffix = text_suffix

        if not any([
                fieldname.endswith(self.text_suffix)
                for fieldname in self.reader.fieldnames
        ]):
            raise ArmyAntException(
                "CSV must have at least one column name with a %s suffix (other supported suffixes include %s)"
                % (self.text_suffix, self.doc_id_suffix))
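
A minimal sketch of a header that satisfies this check; the column names are
made up:

    import csv
    import io

    f = io.StringIO('url:doc_id,body:text\ndoc-1,Example body text.\n')
    reader = csv.DictReader(f)
    assert any(name.endswith(':text') for name in reader.fieldnames)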
Example #16
    def __init__(self, db_location, db_name, loop):
        super().__init__(db_location, db_name, loop)

        db_location_parts = db_location.split(':')

        if len(db_location_parts) > 1:
            db_location = db_location_parts[0]
            db_port = int(db_location_parts[1])
        else:
            db_port = 27017

        try:
            self.client = MongoClient(db_location, db_port)
        except ConnectionFailure:
            raise ArmyAntException(
                "Could not connect to MongoDB instance on %s:%s" %
                (db_location, db_port))

        self.db = self.client[self.db_name]
Example #17
    def factory(task, eval_location):
        import army_ant.evaluation as evl

        if task.eval_format == 'inex':
            return evl.INEXEvaluator(task, eval_location,
                                     Index.QueryType.keyword,
                                     Index.RetrievalTask.document_retrieval)
        elif task.eval_format == 'inex-xer':
            return evl.INEXEvaluator(task, eval_location,
                                     Index.QueryType.keyword,
                                     Index.RetrievalTask.entity_retrieval)
        elif task.eval_format == 'inex-xer-elc':
            return evl.INEXEvaluator(task, eval_location,
                                     Index.QueryType.entity,
                                     Index.RetrievalTask.entity_retrieval)
        elif task.eval_format == 'trec':
            return evl.TRECEvaluator(task, eval_location)
        elif task.eval_format == 'll-api':
            return evl.LivingLabsEvaluator(task, eval_location)
        else:
            raise ArmyAntException("Unsupported evaluator format %s" %
                                   task.eval_format)
Example #18
    def open(index_location, index_type, loop):
        import army_ant.index as idx

        key = Index.__preloaded_key__(index_location, index_type)
        if key in Index.PRELOADED:
            return Index.PRELOADED[key]

        index_features = index_type.split(':')[1:]

        if index_type == 'gow':
            return idx.GraphOfWord(None, index_location, loop)
        elif index_type == 'goe':
            return idx.GraphOfEntity(None, index_location, loop)
        elif index_type == 'gow_batch':
            return idx.GraphOfWordBatch(None, index_location, loop)
        elif index_type == 'goe_batch':
            return idx.GraphOfEntityBatch(None, index_location, loop)
        elif index_type == 'gow_csv':
            return idx.GraphOfWordCSV(None, index_location, loop)
        elif index_type == 'goe_csv':
            return idx.GraphOfEntityCSV(None, index_location, loop)
        elif index_type == 'gremlin':
            return idx.GremlinServerIndex(None, index_location, loop)
        elif index_type.startswith('hgoe'):
            return idx.HypergraphOfEntity(None, index_location, index_features,
                                          loop)
        elif index_type.startswith('lucene_features'):
            return idx.LuceneEngine(None, index_location, index_features, loop)
        elif index_type.startswith('lucene_entities'):
            return idx.LuceneEntitiesEngine(None, index_location,
                                            index_features, loop)
        elif index_type.startswith('lucene'):
            return idx.LuceneEngine(None, index_location, index_features, loop)
        elif index_type.startswith('tfr'):
            return idx.TensorFlowRanking(None, index_location, index_features,
                                         loop)
        else:
            raise ArmyAntException("Unsupported index type %s" % index_type)
Example #19
    def __init__(self, task, eval_location):
        super().__init__(task, eval_location)
        try:
            base_url, api_key, run_id = eval_location.split('::')
        except ValueError:
            raise ArmyAntException(
                "Must provide the base_url, api_key and run_id, separated by '::'"
            )

        self.base_url = urljoin(base_url, 'api/v2/participant/')
        self.auth = HTTPBasicAuth(api_key, '')
        self.headers = {'Content-Type': 'application/json'}

        requests_cache.install_cache('living_labs_cache', expire_after=10800)

        self.loop = asyncio.get_event_loop()
        self.index = Index.open(task.index_location, task.index_type,
                                self.loop)

        self.run_id = run_id
        self.pickle_dir = '/opt/army-ant/cache/%s' % run_id
        if not os.path.exists(self.pickle_dir):
            os.mkdir(self.pickle_dir)
Example #20
    def factory(reader, index_location, index_type, loop):
        import army_ant.index as idx
        index_features = index_type.split(':')[1:]

        if index_type == 'gow':
            return idx.GraphOfWord(reader, index_location, loop)
        elif index_type == 'goe':
            return idx.GraphOfEntity(reader, index_location, loop)
        elif index_type == 'gow_batch':
            return idx.GraphOfWordBatch(reader, index_location, loop)
        elif index_type == 'goe_batch':
            return idx.GraphOfEntityBatch(reader, index_location, loop)
        elif index_type == 'gow_csv':
            return idx.GraphOfWordCSV(reader, index_location, loop)
        elif index_type == 'goe_csv':
            return idx.GraphOfEntityCSV(reader, index_location, loop)
        elif index_type.startswith('hgoe'):
            return idx.HypergraphOfEntity(reader, index_location,
                                          index_features, loop)
        elif index_type.startswith('lucene_features'):
            return idx.LuceneFeaturesEngine(reader, index_location,
                                            index_features, loop)
        elif index_type.startswith('lucene_entities'):
            return idx.LuceneEntitiesEngine(reader, index_location,
                                            index_features, loop)
        elif index_type.startswith('lucene'):
            return idx.LuceneEngine(reader, index_location, index_features,
                                    loop)
        elif index_type.startswith('tfr'):
            return idx.TensorFlowRanking(reader, index_location,
                                         index_features, loop)
        elif index_type.startswith('null_index'):
            return idx.NullIndex(reader, index_location, loop)
        elif index_type.startswith('text_index'):
            return idx.TextIndex(reader, index_location, index_features, loop)
        else:
            raise ArmyAntException("Unsupported index type %s" % index_type)
Example #21
    def load_to_postgres(self, conn, doc):
        raise ArmyAntException("Load function not implemented for %s" %
                               self.__class__.__name__)
Example #22
    async def retrieve(self, results):
        raise ArmyAntException("Retrieve not implemented for %s" %
                               self.__class__.__name__)
Example #23
    async def store(self, index):
        raise ArmyAntException("Store not implemented for %s" %
                               self.__class__.__name__)
Example #24
    def factory(db_location, db_name, db_type, loop):
        if db_type == 'mongo':
            return MongoDatabase(db_location, db_name, loop)
        else:
            raise ArmyAntException("Unsupported database type %s" % db_type)
Example #25
    async def index(self, features_location=None):
        try:
            if HypergraphOfEntity.Feature.keywords in self.index_features:
                logger.info(
                    "Indexing top %.0f%% keywords per document based on TextRank"
                    % (Index.KW_RATIO * 100))

            index_features_str = ':'.join(
                [index_feature.value for index_feature in self.index_features])
            features = [
                HypergraphOfEntity.JFeature.valueOf(index_feature.value)
                for index_feature in self.index_features
                if index_feature != HypergraphOfEntity.Feature.keywords
            ]

            if HypergraphOfEntity.Feature.context in self.index_features:
                if features_location is None:
                    raise ArmyAntException(
                        "Must provide a features_location pointing to a directory"
                    )
                if 'word2vec_simnet.graphml.gz' not in os.listdir(
                        features_location):
                    raise ArmyAntException(
                        "Must provide a 'word2vec_simnet.graphml.gz' file within features directory"
                    )

            hgoe = HypergraphOfEntity.JHypergraphOfEntityInMemory(
                self.index_location, java.util.Arrays.asList(features),
                features_location, True)

            corpus = []
            for doc in self.reader:
                logger.debug("Preloading document %s (%d triples)" %
                             (doc.doc_id, len(doc.triples)))

                entities = []
                if doc.entities:
                    for entity in doc.entities:
                        try:
                            entities.append(
                                HypergraphOfEntity.JEntity(
                                    entity.label, entity.uri))
                        except Exception as e:
                            logger.warning("Entity %s skipped" % entity)
                            logger.exception(e)

                triples = []
                for s, p, o in doc.triples:
                    try:
                        triples.append(
                            HypergraphOfEntity.JTriple(
                                HypergraphOfEntity.JEntity(s.label, s.uri),
                                HypergraphOfEntity.JEntity(p.label, p.uri),
                                HypergraphOfEntity.JEntity(o.label, o.uri)))
                    except Exception as e:
                        logger.warning("Triple (%s, %s, %s) skipped" %
                                       (s, p, o))
                        logger.exception(e)

                if HypergraphOfEntity.Feature.keywords in self.index_features:
                    doc.text = textrank(doc.text, ratio=Index.KW_RATIO)

                jDoc = HypergraphOfEntity.JDocument(
                    JString(doc.doc_id), JString(doc.title), JString(doc.text),
                    java.util.Arrays.asList(triples),
                    java.util.Arrays.asList(entities))
                corpus.append(jDoc)

                if len(corpus) % (JavaIndex.BLOCK_SIZE // 10) == 0:
                    logger.info("%d documents preloaded" % len(corpus))

                if len(corpus) >= JavaIndex.BLOCK_SIZE:
                    logger.info("Indexing batch of %d documents using %s" %
                                (len(corpus), index_features_str))
                    hgoe.indexCorpus(java.util.Arrays.asList(corpus))
                    corpus = []

                yield Document(doc_id=doc.doc_id,
                               metadata={
                                   'url': doc.metadata.get('url'),
                                   'name': doc.metadata.get('name')
                               })

            if len(corpus) > 0:
                logger.info("Indexing batch of %d documents using %s" %
                            (len(corpus), index_features_str))
                hgoe.indexCorpus(java.util.Arrays.asList(corpus))

            hgoe.postProcessing()

            hgoe.save()
        except JException as e:
            logger.error("Java Exception: %s" % e.stacktrace())
Example #26
    def __next__(self):
        raise ArmyAntException("Reader __next__ not implemented")
Example #27
    def extract(self):
        raise ArmyAntException("Extract not implemented for %s" %
                               self.__class__.__name__)
Example #28
    async def run(self):
        raise ArmyAntException("Unsupported evaluator format %s" %
                               self.task.eval_format)
Example #29
    def evaluation(self,
                   index_location,
                   index_type,
                   eval_format,
                   topics_filename=None,
                   assessments_filename=None,
                   base_url=None,
                   api_key=None,
                   run_id=None,
                   output_dir='/opt/army-ant/eval'):
        if eval_format == 'inex' and (topics_filename is None
                                      or assessments_filename is None):
            raise ArmyAntException(
                "Must include the arguments --topics-filename and --assessments-filename"
            )

        if eval_format == 'll-api' and (base_url is None or api_key is None
                                        or run_id is None):
            raise ArmyAntException(
                "Must include the arguments --base-url, --api-key and --run-id"
            )

        if eval_format == 'inex':
            spool_dir = os.path.join(output_dir, 'spool')

            with open(topics_filename, 'rb') as fsrc, \
                    tempfile.NamedTemporaryFile(dir=spool_dir, prefix='eval_topics_', delete=False) as fdst:
                shutil.copyfileobj(fsrc, fdst)
                topics_path = fdst.name

            with open(assessments_filename, 'rb') as fsrc, \
                    tempfile.NamedTemporaryFile(dir=spool_dir, prefix='eval_assessments_', delete=False) as fdst:
                shutil.copyfileobj(fsrc, fdst)
                assessments_path = fdst.name
        else:
            topics_path = None
            assessments_path = None

        # TODO must add query_type, base_indexes, ranking_function and ranking_params
        task = EvaluationTask(index_location=index_location,
                              index_type=index_type,
                              eval_format=eval_format,
                              query_type=None,
                              base_indexes=None,
                              ranking_function=None,
                              ranking_params=None,
                              topics_filename=topics_filename,
                              topics_path=topics_path,
                              assessments_filename=assessments_filename,
                              assessments_path=assessments_path,
                              base_url=base_url,
                              api_key=api_key,
                              run_id=run_id)

        config = yaml.load(open('config.yaml'))
        db_location = config['default'].get('db',
                                            {}).get('location', 'localhost')
        db_name = config['default'].get('db', {}).get('name', 'army_ant')
        manager = EvaluationTaskManager(db_location, db_name, output_dir)

        manager.add_task(task)
        inserted_ids = manager.queue()
        if len(inserted_ids) < 1:
            raise ArmyAntException("Could not queue task")

        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(manager.process(task_id=inserted_ids[0]))
        except KeyboardInterrupt:
            for task in asyncio.Task.all_tasks():
                task.cancel()
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            loop.close()
Example #30
    async def index(self, features_location=None, pgonly=True):
        if os.path.exists(self.index_location):
            raise ArmyAntException("%s already exists" % self.index_location)

        os.mkdir(self.index_location)

        async for item in super().index(pgonly=pgonly):
            yield item

        conn = psycopg2.connect(
            "dbname='army_ant' user='******' host='localhost'")
        c = conn.cursor()

        logging.info("Creating term nodes CSV file")
        with open(os.path.join(self.index_location, 'term-nodes.csv'),
                  'w') as f:
            c.copy_expert(
                """
            COPY (
                SELECT
                    node_id AS "node_id:ID",
                    attributes->'name'->0->>'value' AS name,
                    attributes->'type'->0->>'value' AS type,
                    label AS ":LABEL"
                FROM nodes
                WHERE label = 'term'
            )
            TO STDOUT WITH CSV HEADER
            """, f)

        logging.info("Creating entity nodes CSV file")
        with open(os.path.join(self.index_location, 'entity-nodes.csv'),
                  'w') as f:
            c.copy_expert(
                """
            COPY (
                SELECT
                    node_id AS "node_id:ID",
                    regexp_replace(attributes->'name'->0->>'value', E'[\\n\\r]', ' ', 'g') AS name,
                    attributes->'type'->0->>'value' AS type,
                    attributes->'doc_id'->0->>'value' AS doc_id,
                    label AS ":LABEL"
                FROM nodes
                WHERE label = 'entity'
            )
            TO STDOUT WITH CSV HEADER
            """, f)

        logging.info("Creating before edges CSV file")
        with open(os.path.join(self.index_location, 'before-edges.csv'),
                  'w') as f:
            c.copy_expert(
                """
            COPY (
                SELECT
                    source_node_id AS ":START_ID",
                    attributes->>'doc_id' AS doc_id,
                    target_node_id AS ":END_ID",
                    label AS ":TYPE"
                FROM edges
                WHERE label = 'before'
            )
            TO STDOUT WITH CSV HEADER
            """, f)

        logging.info("Creating related_to edges CSV file")
        with open(os.path.join(self.index_location, 'related_to-edges.csv'),
                  'w') as f:
            c.copy_expert(
                """
            COPY (
                SELECT
                    source_node_id AS ":START_ID",
                    target_node_id AS ":END_ID",
                    label AS ":TYPE"
                FROM edges
                WHERE label = 'related_to'
            )
            TO STDOUT WITH CSV HEADER
            """, f)

        logging.info("Creating contained_in edges CSV file")
        with open(os.path.join(self.index_location, 'contained_in-edges.csv'),
                  'w') as f:
            c.copy_expert(
                """
            COPY (
                SELECT
                    source_node_id AS ":START_ID",
                    target_node_id AS ":END_ID",
                    label AS ":TYPE"
                FROM edges
                WHERE label = 'contained_in'
            )
            TO STDOUT WITH CSV HEADER
            """, f)