def execute(self, context):
        """Exact-match publications against the ES index and upload the
        matches to S3 as gzipped JSON lines (one match per line)."""
        with safe_import():
            from reach.refparse.refparse import exact_match_publication

        publications_path = f's3://{self.publications_path}'
        exact_matched_references_path = f's3://{self.exact_matched_references_path}'

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)
        exact_matcher = ElasticsearchExactMatcher(
            self.es, self.title_length_threshold)

        with tempfile.NamedTemporaryFile(mode='wb') as raw_file:
            # The gzip wrapper must be closed (end of the inner block) before
            # the upload so its trailer is written to the temp file.
            with gzip.GzipFile(mode='wb', fileobj=raw_file) as gz_file:
                for publication in yield_publications(s3, publications_path):
                    matches = exact_match_publication(
                        exact_matcher, publication)
                    for match in matches:
                        if not match:
                            continue
                        logger.info("Match")
                        gz_file.write(json.dumps(match).encode('utf-8'))
                        gz_file.write(b'\n')

            raw_file.flush()
            s3.load_file(
                filename=raw_file.name,
                key=exact_matched_references_path,
                replace=True,
            )
# Example #2
    def execute(self, context):
        """Split raw documents into references and parse them, writing the
        split and parsed outputs as two gzipped JSON-lines files on S3."""
        with safe_import():
            from reach.refparse.refparse import yield_structured_references

        pool_map = map
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        with tempfile.NamedTemporaryFile() as split_rawf, \
                tempfile.NamedTemporaryFile() as parsed_rawf:
            # Close the gzip wrappers (end of the inner block) before
            # uploading so their trailers are flushed to the temp files.
            with gzip.GzipFile(mode='wb', fileobj=split_rawf) as split_f, \
                    gzip.GzipFile(mode='wb', fileobj=parsed_rawf) as parsed_f:
                structured = yield_structured_references(
                    self.src_s3_key, pool_map, logger)
                for split_refs, parsed_refs in structured:
                    split_f.write(json.dumps(split_refs).encode('utf-8'))
                    split_f.write(b'\n')
                    for parsed_ref in parsed_refs:
                        parsed_f.write(json.dumps(parsed_ref).encode('utf-8'))
                        parsed_f.write(b'\n')

            split_rawf.flush()
            parsed_rawf.flush()

            for filename, key in ((split_rawf.name, self.split_s3_key),
                                  (parsed_rawf.name, self.parsed_s3_key)):
                s3.load_file(
                    filename=filename,
                    key=key,
                    replace=True,
                )
# Example #3
    def execute(self, context):
        """Fuzzy-match structured references against the ES index, de-duplicate
        the matches by ``reference_id``, and upload them to ``dst_s3_key`` as
        gzipped JSON lines (one reference per line).

        Progress is logged every 500 references read and every 100 matches.
        """
        with safe_import():
            from reach.refparse.refparse import fuzzy_match_reference

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        fuzzy_matcher = ElasticsearchFuzzyMatcher(
            self.es,
            self.score_threshold,
            self.should_match_threshold,
            self.es_index,
            self.organisation,
        )
        refs = yield_structured_references(s3, self.src_s3_key)
        match_count = 0
        count = 0
        # Matches keyed by reference_id so repeat matches are merged into
        # one record instead of being written out as duplicates.
        references = {}
        for count, structured_reference in enumerate(refs, 1):
            if count % 500 == 0:
                logger.info('FuzzyMatchRefsOperator: references=%d', count)
            fuzzy_matched_reference = fuzzy_match_reference(
                fuzzy_matcher, structured_reference)
            if not fuzzy_matched_reference:
                continue

            ref_id = fuzzy_matched_reference['reference_id']
            # Fixed: membership test directly on the dict instead of the
            # redundant `in references.keys()`.
            if ref_id in references:
                # Already seen: bump the policy counter and record the new
                # policy on the existing record.
                references[ref_id]['associated_policies_count'] += 1
                references[ref_id]['policies'].append(
                    fuzzy_matched_reference['policies'][0])
            else:
                references[ref_id] = fuzzy_matched_reference

            match_count += 1
            if match_count % 100 == 0:
                logger.info('FuzzyMatchRefsOperator: matches=%d',
                            match_count)

        with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
            # The gzip wrapper must be closed before the upload so its
            # trailer is written to the temp file.
            with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
                for reference in references.values():
                    output_f.write(json.dumps(reference).encode('utf-8'))
                    output_f.write(b'\n')

            output_raw_f.flush()
            s3.load_file(
                filename=output_raw_f.name,
                key=self.dst_s3_key,
                replace=True,
            )
            logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                        count, match_count)

            logger.info('FuzzyMatchRefsOperator: Matches saved to %s',
                        s3.get_key(self.dst_s3_key))
# Example #4
    def execute(self, context):
        """Parse all PDFs under ``src_s3_dir`` for ``organisation`` and write
        the results to ``dst_s3_key``.

        Raises:
            ValueError: if ``src_s3_dir`` or ``dst_s3_key`` is not an
                ``s3://`` URI.
        """
        with safe_import():
            import reach.pdf_parser.main as pdf_parser_main

        os.environ.setdefault('SCRAPY_SETTINGS_MODULE',
                              'scraper.wsf_scraping.settings')
        # Fail fast with a descriptive message instead of a bare ValueError.
        if not self.src_s3_dir.startswith('s3://'):
            raise ValueError(
                'src_s3_dir must be an s3:// URI, got %r' % (self.src_s3_dir,))
        if not self.dst_s3_key.startswith('s3://'):
            raise ValueError(
                'dst_s3_key must be an s3:// URI, got %r' % (self.dst_s3_key,))

        # NOTE(review): this produces a 'manifests3://...' URI; presumably
        # parse_all_pdf treats the 'manifest' prefix as a scheme modifier —
        # confirm against reach.pdf_parser.main.
        input_uri = 'manifest' + self.src_s3_dir
        pdf_parser_main.parse_all_pdf(
            self.organisation,
            input_uri,
            self.dst_s3_key,
        )
# Example #5
    def execute(self, context):
        """Fuzzy-match structured references against the ES index and upload
        every match to ``dst_s3_key`` as gzipped JSON lines."""
        with safe_import():
            from reach.refparse.refparse import fuzzy_match_reference

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        fuzzy_matcher = ElasticsearchFuzzyMatcher(
            self.es,
            self.score_threshold,
            self.should_match_threshold,
            self.es_index,
        )

        with tempfile.NamedTemporaryFile(mode='wb') as raw_out:
            # The gzip wrapper is closed at the end of the inner block so its
            # trailer is on disk before the upload below.
            with gzip.GzipFile(mode='wb', fileobj=raw_out) as gz_out:
                match_count = 0
                count = 0
                structured_refs = yield_structured_references(
                    s3, self.src_s3_key)
                for count, structured_ref in enumerate(structured_refs, 1):
                    if count % 500 == 0:
                        logger.info('FuzzyMatchRefsOperator: references=%d',
                                    count)
                    matched = fuzzy_match_reference(
                        fuzzy_matcher, structured_ref)
                    if not matched:
                        continue
                    match_count += 1
                    if match_count % 100 == 0:
                        logger.info('FuzzyMatchRefsOperator: matches=%d',
                                    match_count)
                    gz_out.write(json.dumps(matched).encode('utf-8'))
                    gz_out.write(b'\n')

            raw_out.flush()
            s3.load_file(
                filename=raw_out.name,
                key=self.dst_s3_key,
                replace=True,
            )
            logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                        count, match_count)