def execute(self, context):
    """Exact-match every publication against Elasticsearch and upload
    the matches to S3 as a gzipped JSON-lines file.

    Publications are streamed from ``self.publications_path``; each
    non-empty match is written as one JSON document per line.
    """
    with safe_import():
        from reach.refparse.refparse import exact_match_publication

    # Build fully qualified S3 URIs from the configured key paths.
    publications_path = 's3://{path}'.format(path=self.publications_path)
    exact_matched_references_path = 's3://{path}'.format(
        path=self.exact_matched_references_path,
    )

    s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)
    matcher = ElasticsearchExactMatcher(
        self.es,
        self.title_length_threshold,
    )

    with tempfile.NamedTemporaryFile(mode='wb') as raw_file:
        # Compress on the fly; the gzip stream is finalised when this
        # inner context exits, before the file is shipped to S3.
        with gzip.GzipFile(mode='wb', fileobj=raw_file) as gz:
            for publication in yield_publications(s3, publications_path):
                for match in exact_match_publication(matcher, publication):
                    if not match:
                        continue
                    logger.info("Match")
                    gz.write(json.dumps(match).encode('utf-8'))
                    gz.write(b'\n')
        raw_file.flush()
        s3.load_file(
            filename=raw_file.name,
            key=exact_matched_references_path,
            replace=True,
        )
def execute(self, context):
    """Split and parse structured references from ``self.src_s3_key``,
    writing the split and parsed outputs to two gzipped JSON-lines
    files and uploading each to its own S3 key.
    """
    with safe_import():
        from reach.refparse.refparse import yield_structured_references

    # Plain built-in map: references are processed serially here.
    pool_map = map
    s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

    with tempfile.NamedTemporaryFile() as split_rawf, \
            tempfile.NamedTemporaryFile() as parsed_rawf:
        with gzip.GzipFile(mode='wb', fileobj=split_rawf) as split_gz, \
                gzip.GzipFile(mode='wb', fileobj=parsed_rawf) as parsed_gz:
            structured = yield_structured_references(
                self.src_s3_key, pool_map, logger)
            for split_refs, parsed_refs in structured:
                # One JSON document per line in each output stream.
                split_gz.write(json.dumps(split_refs).encode('utf-8'))
                split_gz.write(b'\n')
                for parsed_ref in parsed_refs:
                    parsed_gz.write(json.dumps(parsed_ref).encode('utf-8'))
                    parsed_gz.write(b'\n')
        # gzip streams are closed above; flush each temp file and
        # upload it to its destination key.
        for tmp_file, dest_key in (
            (split_rawf, self.split_s3_key),
            (parsed_rawf, self.parsed_s3_key),
        ):
            tmp_file.flush()
            s3.load_file(
                filename=tmp_file.name,
                key=dest_key,
                replace=True,
            )
def execute(self, context):
    """Fuzzy-match structured references against Elasticsearch,
    de-duplicate them by reference id, and upload the merged results
    to ``self.dst_s3_key`` as a gzipped JSON-lines file.

    When several policies match the same reference, the first match is
    kept and later ones increment its ``associated_policies_count``
    and extend its ``policies`` list.
    """
    with safe_import():
        from reach.refparse.refparse import fuzzy_match_reference

    s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)
    fuzzy_matcher = ElasticsearchFuzzyMatcher(
        self.es,
        self.score_threshold,
        self.should_match_threshold,
        self.es_index,
        self.organisation,
    )

    refs = yield_structured_references(s3, self.src_s3_key)
    match_count = 0
    count = 0
    references = {}
    for count, structured_reference in enumerate(refs, 1):
        if count % 500 == 0:
            logger.info('FuzzyMatchRefsOperator: references=%d', count)
        fuzzy_matched_reference = fuzzy_match_reference(
            fuzzy_matcher, structured_reference)
        if not fuzzy_matched_reference:
            continue

        ref_id = fuzzy_matched_reference['reference_id']
        # FIX: membership test directly on the dict instead of the
        # `.keys()` view — same behavior, idiomatic and O(1).
        if ref_id in references:
            seen = references[ref_id]
            seen['associated_policies_count'] += 1
            seen['policies'].append(
                fuzzy_matched_reference['policies'][0])
        else:
            references[ref_id] = fuzzy_matched_reference

        # Counted per non-empty match (duplicates included), mirroring
        # the sibling fuzzy-match operator in this file.
        match_count += 1
        if match_count % 100 == 0:
            logger.info(
                'FuzzyMatchRefsOperator: matches=%d', match_count)

    with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
        with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
            for reference in references.values():
                output_f.write(json.dumps(reference).encode('utf-8'))
                output_f.write(b'\n')
        output_raw_f.flush()
        s3.load_file(
            filename=output_raw_f.name,
            key=self.dst_s3_key,
            replace=True,
        )
    logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                count, match_count)
    logger.info('FuzzyMatchRefsOperator: Matches saved to %s',
                s3.get_key(self.dst_s3_key))
def execute(self, context):
    """Parse every PDF under ``self.src_s3_dir`` and write the parsed
    output to ``self.dst_s3_key``.

    Raises:
        ValueError: if either S3 location is not an ``s3://`` URI.
    """
    with safe_import():
        import reach.pdf_parser.main as pdf_parser_main

    os.environ.setdefault(
        'SCRAPY_SETTINGS_MODULE', 'scraper.wsf_scraping.settings')

    # FIX: bare `raise ValueError` gave operators no hint about which
    # setting was wrong; the exception type is unchanged for callers.
    if not self.src_s3_dir.startswith('s3://'):
        raise ValueError(
            'src_s3_dir must be an s3:// URI, got %r' % self.src_s3_dir)
    if not self.dst_s3_key.startswith('s3://'):
        raise ValueError(
            'dst_s3_key must be an s3:// URI, got %r' % self.dst_s3_key)

    # NOTE(review): yields e.g. 'manifests3://...' — presumably a
    # scheme understood by parse_all_pdf; confirm before changing.
    input_uri = 'manifest' + self.src_s3_dir
    pdf_parser_main.parse_all_pdf(
        self.organisation,
        input_uri,
        self.dst_s3_key,
    )
def execute(self, context):
    """Fuzzy-match structured references against Elasticsearch and
    upload every non-empty match to ``self.dst_s3_key`` as a gzipped
    JSON-lines file (no de-duplication).
    """
    with safe_import():
        from reach.refparse.refparse import fuzzy_match_reference

    s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)
    matcher = ElasticsearchFuzzyMatcher(
        self.es,
        self.score_threshold,
        self.should_match_threshold,
        self.es_index,
    )

    with tempfile.NamedTemporaryFile(mode='wb') as raw_file:
        with gzip.GzipFile(mode='wb', fileobj=raw_file) as gz:
            match_count = 0
            count = 0
            structured = yield_structured_references(s3, self.src_s3_key)
            for count, structured_reference in enumerate(structured, 1):
                # Periodic progress logging for long runs.
                if count % 500 == 0:
                    logger.info(
                        'FuzzyMatchRefsOperator: references=%d', count)
                matched = fuzzy_match_reference(
                    matcher, structured_reference)
                if not matched:
                    continue
                match_count += 1
                if match_count % 100 == 0:
                    logger.info(
                        'FuzzyMatchRefsOperator: matches=%d', match_count)
                gz.write(json.dumps(matched).encode('utf-8'))
                gz.write(b'\n')
        raw_file.flush()
        s3.load_file(
            filename=raw_file.name,
            key=self.dst_s3_key,
            replace=True,
        )
    logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                count, match_count)