def count_word(self):
    """ Count stop words and output statistics. """
    word_list = []
    result = ETL().extract_word()
    count, total = 0, len(result)
    for row in result:
        count += 1
        test_id = row[0]
        print(f"{test_id}, {count}/{total}")
        try:
            dump = ETL().extract_cdb(test_id)
            processed = Process(dump).internal_process()
        except (IndexError, UnicodeDecodeError):
            continue
        if "\n\n" in dump:
            exceptions = dump[dump.index("\n\n") + len("\n\n"):]
            try:
                header = "exception throw location:\n"
                stack = exceptions[exceptions.index(header) + len(header):]
            except ValueError:
                continue
            # extract root cause from exceptions
            if dump.count(header) > 1:
                stack = stack[:stack.index("\n\n")]
            roots = re.findall(r"^\d+:[ ](.+)[ ]at[ ].+", stack, re.M)
            words = self.obtain_word(roots, processed)
            word_list += words
    Log().chart_print(Counter(word_list).most_common(10))
def enrich(plugins, uri, wait=0):
    if wait:
        time.sleep(wait)

    etl = ETL()
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')
    etl.config['plugins'] = plugins.split(',')

    filename = uri

    # if present, strip the protocol prefix file://
    if filename.startswith("file://"):
        filename = filename.replace("file://", '', 1)

    parameters = etl.config.copy()
    parameters['id'] = uri
    parameters['filename'] = filename

    parameters, data = etl.process(parameters=parameters, data={})

    return data
def export_row_data_to_index(self, data, rownumber):
    parameters = self.config.copy()

    # todo: all configured content plugins, not only this one
    parameters['plugins'] = [
        'enhance_path',
    ]

    etl = ETL()

    try:
        etl.process(parameters=parameters, data=data)

    # if the user interrupted via keyboard, respect this and abort
    except KeyboardInterrupt:
        raise KeyboardInterrupt

    except BaseException as e:
        # note: Python 3 exceptions have no .message attribute, so print the exception itself
        sys.stderr.write("Exception adding CSV row {} : {}".format(rownumber, e))
        if 'raise_pluginexception' in self.config:
            if self.config['raise_pluginexception']:
                raise e
def update_mapping_new_product_id(self):
    logging.info("Create new product id after pruning dataset")
    _r_map = self.dataset[['PRODUCT_ID', 'NEW_PRODUCT_ID']]
    mm = [_r_map['PRODUCT_ID'].unique(), _r_map['NEW_PRODUCT_ID'].unique()]
    mapping = pd.DataFrame(data=np.array(mm).T,
                           columns=["product_id", "new_product_id"])
    logging.info("Save new product id into db")
    ETL(DATA_PATH, CONFIG_PATH, SCHEMA_PATH).insert_new_product_id_table(mapping)
def main():
    etl = ETL()
    etl.extract()
    etl.transform()

    kmeans_model = KMeansModel(etl.observations, 'modeling_text',
                               n_clusters=3, n_features=1000)
    kmeans_model.vectorize()
    kmeans_model.apply_lsa(n_components=50)
    kmeans_model.run()
    kmeans_model.get_metrics()
def setUp(self):
    self.etl = ETL()
    data = {
        'customer_id': [1, 1],
        'order_id': [1, 2],
        'order_item_id': [5, 6],
        'num_items': [2, 3],
        'revenue': [90, 50],
        'created_at_date': [datetime(2017, 10, 5), datetime(2017, 10, 12)]
    }
    self.test_df = pd.DataFrame.from_dict(data)
def ETI_TEST():
    print("Starting ETL Test Job !!!!")
    var = Variable()
    var.INPUT_DATA = "/input/employee.csv"
    csvParser = csv_parser(var.INPUT_DATA)
    if csvParser.file_exist(var.INPUT_DATA):
        # takes the table name from the filename
        TABLE_NAME = csvParser.get_table_name(var.INPUT_DATA)
        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header(), 20)
def main(): print("Staring ETL Job !!!!") var = Variable() csvParser = csv_parser(var.INPUT_DATA) if csvParser.file_exist(var.INPUT_DATA): TABLE_NAME = csvParser.get_table_name( var.INPUT_DATA) # Takes Table name from Filename if var.CREATE_TABLE and not var.RELATION: ob = ETL(TABLE_NAME, var.INPUT_DATA) ob.etl_process(csvParser.check_header())
def main():
    parser = argparse.ArgumentParser(
        description='CLI for the pldb application.')
    parser.add_argument(
        '--update',
        action='store_true',
        help='Download season data and update the JSON file and the database.')
    parser.add_argument(
        '--table',
        action='store_true',
        help='Display the current standings table (calculated from data in the database).')
    parser.add_argument(
        '--club',
        type=str,
        default=None,
        help='Display info for all the matches for the given club in the season.')
    args = parser.parse_args()

    if args.update:
        print("Updating season data...")
        etl = ETL()
        etl.run()
        print("done.")
    elif args.table:
        query = Query()
        table_data = query.table()
        print("#\tClub\tPlayed\tWon\tDrawn\tLost\tGD\tPoints")
        for rank in range(len(table_data)):
            row = table_data[rank]
            print(
                f"{rank + 1}\t{row['club']}\t{row['matches_played']}\t"
                f"{row['wins']}\t{row['draws']}\t{row['losses']}\t{row['goal_diff']}\t"
                f"{row['points']}")
    elif args.club:
        query = Query()
        for match in query.club(args.club):
            kick_time = match['kickoff'] / 1000
            kick_time = datetime.datetime.fromtimestamp(kick_time).strftime(
                "%a %d %b %H:%M")
            if match['status'] == 'C':
                score = f"{match['away_goals']} {match['home_goals']}"
            else:
                score = ' @ '
            print(
                f"{kick_time} {match['away_club']['abbr']} "
                f"{score} {match['home_club']['abbr']} {match['ground']['name']}")
def main(): """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`. Args: -c --config <config_file> the Stetl config file. -s --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]). -a --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc. """ args = parse_args() # Do the ETL etl = ETL(vars(args), args.config_args) etl.run()
def __init__(self, pruning_method=None):
    self.c_users = pd.DataFrame(
        data=ETL(DATA_PATH, CONFIG_PATH, SCHEMA_PATH).select_complaints_users_from_db(),
        columns=['COMPLAINT_ID', 'COMPLAINT_TEXT', 'PRODUCT_ID'])
    p_id = self.pruning_product_list(pruning_method=pruning_method)
    self.dataset = self.pruning_data_set(p_id)
    self.update_mapping_new_product_id()
    texts = self.clean_text()
    self.nnds = NNetDS()
    self.prepare_tf_data_set(texts)
    self.nnds.embedding_matrix = EmbeddingGlove(
        MAX_WORDS=MAX_WORDS,
        MAX_SEQ_LENGTH=MAX_SEQ_LENGTH,
        GLOV_EMBEDDING_DIM=GLOV_EMBEDDING_DIM,
        word_index=self.nnds.word_index).get_matrix()
    self.build_tf_model()
def __init__(self):
    dirname = os.path.dirname(__file__)

    # Script configuration
    os.environ['FABRIC_URL'] = 'http://localhost:3000/'
    os.environ['STORAGE_DIR'] = os.path.join(dirname, 'tmp')
    os.environ['USER_LIST'] = os.path.join(dirname, 'data/user.csv')
    os.environ['USER_STORAGE'] = os.path.join(os.getenv('STORAGE_DIR'), 'user.csv')
    os.environ['HOLIDAY_CALENDAR'] = os.path.join(dirname, 'data/thHoliday2563-64.csv')

    # Import grouped commands
    self.user = User()
    self.fabric = Fabric()
    self.service = Service()
    self.bid = Bid()
    self.etl = ETL()
def __init__(self, name, question, db_table, row_handler, answer_cql):
    self._name = name
    self._question = question
    self._db_table = db_table
    self._answer_cql = answer_cql
    self._etl = ETL(
        f"{name} ETL",
        [helper.join_path(TEMP_DIR, 'staging.csv')],
        {
            "target": 'Cassandra',
            "table": db_table["table_name"],
            "is_file": False
        },
        [i[0] for i in db_table["cols"]],
        row_handler,
    )
    logger.info(f"{self._name} - Question: {self._question}")
def main(): """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`. Args: -c --config <config_file> the Stetl config file. -s --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]). -a --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc. -d --doc <class> Get component documentation like its configuration parameters, e.g. stetl --doc stetl.inputs.fileinput.FileInput -h --help get help info """ args = parse_args() if args.config_file: # Do the ETL etl = ETL(vars(args), args.config_args) etl.run() elif args.doc_args: print_doc(args.doc_args) else: print('Unknown option, try stetl -h for help')
def test_blacklist(self):
    etl = ETL()
    etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_law']
    etl.config['raise_pluginexception'] = True

    data = {}
    data['content_txt'] = "\n".join(["No clause for law code alias CC"])

    parameters, data = etl.process(
        parameters={'id': 'test_enhance_extract_law'}, data=data)

    self.assertFalse('Swiss Civil Code' in data['law_code_ss'])

    data['content_txt'] = "\n".join([
        "No clause for blacklisted law code alias CC but not blacklisted label of this alias: Swiss Civil Code"
    ])

    parameters, data = etl.process(
        parameters={'id': 'test_enhance_extract_law'}, data=data)

    self.assertTrue('Swiss Civil Code' in data['law_code_ss'])
def detect_sim(self):
    """ Detect crash dump similarity and output the comparison result. """
    message = []
    order_pair, block_pair = [], []
    for param in self.params:
        # parameter is test_id
        if re.match(r"^\d{9,}$", param):
            dump = ETL().extract_cdb(param)
            processed = Process(dump).internal_process()
        # parameter is dump_path
        else:
            with open(param, "r", encoding="utf-8") as fp:
                dump = fp.read()
            processed = Process(dump).pre_process()
        cpnt_order, func_block = Knowledge(processed).add_knowledge()
        message.extend([cpnt_order, func_block])
        order_pair.append(cpnt_order)
        block_pair.append(func_block)
    # output dump comparison
    Log().dump_print(message)
    Calculate(order_pair, block_pair).calculate_sim(debug=True)
def test(self):
    etl = ETL()
    etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_law']
    etl.config['raise_pluginexception'] = True

    data = {}
    data['content_txt'] = "\n".join([
        "abc § 888 xyz"
        "abc § 987 b xyz"
        "§12",
        "§ 123",
        "§345a",
        "§456 b",
        "§ 567 c",
        "BGB § 153 Abs. 1 Satz 2",
        "§ 52 Absatz 1 Nummer 2 Buchstabe c STGB",
        "§ 444 CC"
    ])

    # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)
    parameters, data = etl.process(
        parameters={'id': 'test_enhance_extract_law'}, data=data)

    self.assertTrue('§ 888' in data['law_clause_ss'])
    self.assertTrue('§ 987 b' in data['law_clause_ss'])
    self.assertTrue('§ 12' in data['law_clause_ss'])
    self.assertTrue('§ 123' in data['law_clause_ss'])
    self.assertTrue('§ 345a' in data['law_clause_ss'])
    self.assertTrue('§ 456 b' in data['law_clause_ss'])
    self.assertTrue('§ 567 c' in data['law_clause_ss'])
    self.assertTrue('§ 153 Abs. 1 Satz 2' in data['law_clause_ss'])
    self.assertTrue('§ 52 Absatz 1 Nummer 2 Buchstabe c' in data['law_clause_ss'])
    self.assertTrue('Strafgesetzbuch' in data['law_code_ss'])
    self.assertTrue('Bürgerliches Gesetzbuch' in data['law_code_ss'])
    self.assertTrue('Swiss Civil Code' in data['law_code_ss'])
def __init__(self):
    self.etl = ETL()
#!/usr/bin/python
# -*- coding: utf-8 -*-

from etl import ETL

if __name__ == "__main__":
    etl = ETL()
    # monthly
    etl.check_monthly_ranking()
    etl.check_new_actress()
    # daily
    etl.check_new_works()
def get_product_name(self, text):
    array_text = self.clean_text(text)
    new_id = self.get_new_product_id(array_text)
    return ETL(DATA_PATH, CONFIG_PATH, SCHEMA_PATH).select_product_name(new_id)
def test_init(self):
    dl = ETL()
def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'id' in data:
        docid = data['id']
    else:
        docid = parameters['id']

    # default classifier
    classifier = 'en_core_web_sm'
    if 'spacy_ner_classifier_default' in parameters:
        classifier = parameters['spacy_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['spacy_ner_classifiers']:
            classifier = parameters['spacy_ner_classifiers'][data['language_s']]

    analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # extract sentences from text
    url = "http://localhost:8080/sents"
    if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
        url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'

    headers = {'content-type': 'application/json'}
    d = {'text': text, 'model': classifier}

    response = requests.post(url, data=json.dumps(d), headers=headers)
    sentences = response.json()

    etl = ETL()

    sentencenumber = 0

    for sentence in sentences:
        sentencenumber += 1
        partdocid = docid + '#sentence' + str(sentencenumber)

        partparameters = parameters.copy()
        partparameters['plugins'] = [
            'enhance_path', 'enhance_detect_language_tika_server',
            'enhance_entity_linking', 'enhance_multilingual'
        ]

        if 'enhance_ner_spacy' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_spacy')
        if 'enhance_ner_stanford' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_stanford')

        sentencedata = {}
        sentencedata['id'] = partdocid
        sentencedata['container_s'] = docid
        if 'author_ss' in data:
            sentencedata['author_ss'] = data['author_ss']
        sentencedata['content_type_group_ss'] = "Sentence"
        sentencedata['content_type_ss'] = "Sentence"
        sentencedata['content_txt'] = sentence

        # index sentence
        try:
            partparameters, sentencedata = etl.process(partparameters, sentencedata)
        except BaseException as e:
            sys.stderr.write("Exception adding sentence {} : {}".format(sentencenumber, e))

    data['sentences_i'] = sentencenumber

    return parameters, data
    Args:
        df (object): Pandas DataFrame

    Returns:
        df (object): Pandas DataFrame with median filled NaNs
    '''
    categoricals = []
    for col in list(df.columns):
        if df[col].dtype.name in ['object', 'category']:
            categoricals.append(col)

    for col in list(df.columns):
        if col not in categoricals:
            df[col] = df[col].fillna(df[col].median())

    return df


if __name__ == "__main__":
    path = '../data/'
    d = Data(path)
    d.get_data()
    df = d.clean_data()

    fe = Feature_Extractor()
    training_data = fe.extract_features(df, verbose=True)
    utils.save_csv(training_data, path, 'training_data')

    etl = ETL(connection,
              data_path,
              schema_path,
              engine,
              df_to_write=training_data,
              table_name="training_data",
              remove=False,
              create=False,
              load=False,
              verbose=True)
    etl.pipeline()
def etl_graph(self, parameters):

    # Print infos
    if self.verbose:
        print("Graph has {} triples.".format(len(self.graph)))

    count = 0

    part_parameters = {}
    part_parameters['plugins'] = []
    # todo: like enhance_path for properties & subjects?
    # abstract variable of enhance_path plugin?
    part_parameters['export'] = parameters['export']

    # since there can be multiple triples/values for same property,
    # do not overwrite but add value to existent values of the facet/field/property
    part_parameters['add'] = True

    for subj, pred, obj in self.graph:

        part_data = {}
        part_data['content_type'] = 'Knowledge graph'

        count += 1

        if self.verbose:
            print("Importing triple {}".format(count))

        try:
            # subject as URI/ID
            part_parameters['id'] = subj

            if self.verbose:
                print("ID (RDF subject): {}".format(subj))

            #
            # Predicate/property to facet/field
            #

            rdf_property = pred

            # set Solr datatype so facets not available yet in Solr schema
            # can be inserted automatically (dynamic fields) with right datatype
            facet = rdf_property + '_ss'

            if self.verbose:
                print("Facet: {}".format(facet))

            #
            # object to facet/field value
            #

            value = self.get_labels_from_rdfobject(obj)

            # insert or append value (object of triple) to data
            part_data[facet] = value

            #
            # Property statistics
            #

            # add to facet property where you can see which properties are available
            part_data['property_ss'] = pred

            # todo: set parameter to add instead of update for multiple triples/values for/with same property
            etl = ETL()
            etl.verbose = self.verbose

            # index triple
            etl.process(part_parameters, part_data)

        except KeyboardInterrupt:
            raise KeyboardInterrupt

        except BaseException as e:
            sys.stderr.write("Exception while triple {}: {}\n".format(count, e))
def etl_graph(self, parameters):

    if self.verbose:
        print("Graph has {} triples.".format(len(self.graph)))

    count_triple = 0
    count_subjects = 0

    part_parameters = {}
    part_parameters['plugins'] = []
    part_parameters['export'] = parameters['export']

    property2facet = {}
    if 'property2facet' in parameters:
        property2facet = parameters['property2facet']

    etl_processor = ETL()
    etl_processor.verbose = self.verbose

    class_properties = []
    class_properties.append(rdflib.term.URIRef(
        u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
    class_properties.append(rdflib.term.URIRef(
        u'http://www.wikidata.org/prop/direct/P31'))

    # since there can be multiple triples/values for same property,
    # do not overwrite document but add value to existent document & values of the facet/field/property
    part_parameters['add'] = True
    # but not for the field content_type which doesn't change and is not multi valued
    part_parameters['fields_set'] = "content_type"

    # use SPARQL query with distinct to get subjects only once
    res = self.graph.query(
        """SELECT DISTINCT ?subject
           WHERE {
              ?subject ?predicate ?object .
           }""")

    for row in res:

        count_subjects += 1

        if self.verbose:
            print("Importing entity / subject {}".format(count_subjects))

        # get subject of the concept from first column
        subj = row[0]

        if self.verbose:
            print("Processing RDF subject {}".format(subj))

        part_data = {}
        part_data['content_type'] = 'Knowledge graph'
        part_data['content_type_group'] = 'Knowledge graph'

        # subject as URI/ID
        part_parameters['id'] = subj

        preferred_label = self.get_preferred_label(subject=subj)
        part_data['title'] = preferred_label

        count_subject_triple = 0

        # get all triples for this subject
        for pred, obj in self.graph.predicate_objects(subject=subj):

            count_triple += 1
            count_subject_triple += 1

            if self.verbose:
                print("Importing subjects triple {}".format(count_subject_triple))
                print("Predicate / property: {}".format(pred))
                print("Object / value: {}".format(obj))

            try:
                # if class, add preferred label of this entity to facet of its class
                # (RDF rdf:type or Wikidata "instance of" (Property:P31)),
                # so its name (label) will be available in entities view and as filter for faceted search
                if pred in class_properties:
                    class_facet = str(obj)
                    # map class to facet, if a mapping for the class exists
                    if class_facet in property2facet:
                        class_facet = property2facet[class_facet]
                    # etl.append presumably refers to the module-level append helper of the etl module,
                    # not the ETL instance etl_processor above
                    etl.append(data=part_data, facet=class_facet, values=preferred_label)

                #
                # Predicate/property to facet/field
                #

                # set Solr datatype strings so facets not available yet in Solr schema
                # can be inserted automatically (dynamic fields) with right datatype
                facet = pred + '_ss'
                facet_uri = facet + '_uri_ss'
                facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'

                if self.verbose:
                    print("Facet: {}".format(facet))

                #
                # get values or labels of this object
                #

                values = self.get_values(obj=obj)
                if self.verbose:
                    print("Values: {}".format(values))

                # insert or append value (object of triple) to data
                etl.append(data=part_data, facet=facet, values=values)

                # if object is reference/URI append URI
                if type(obj) == rdflib.URIRef:
                    uri = obj
                    etl.append(data=part_data, facet=facet_uri, values=uri)

                    # append mixed field with preferred label and URI of the object for disambiguation
                    # of different Entities/IDs/URIs with same names/labels in faceted search
                    preferredlabel_and_uri = "{} <{}>".format(
                        self.get_preferred_label(subject=obj), obj)
                else:
                    preferredlabel_and_uri = self.get_preferred_label(subject=obj)

                etl.append(data=part_data, facet=facet_preferred_label_and_uri,
                           values=preferredlabel_and_uri)

            except KeyboardInterrupt:
                raise KeyboardInterrupt

            except BaseException as e:
                sys.stderr.write("Exception while triple {} of subject {}: {}\n".format(
                    count_subject_triple, subj, e))

        # index triple
        etl_processor.process(part_parameters, part_data)
def process(self, parameters=None, data=None):
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'id' in data:
        docid = data['id']
    else:
        docid = parameters['id']

    # default classifier
    classifier = 'en_core_web_sm'
    if 'spacy_ner_classifier_default' in parameters:
        classifier = parameters['spacy_ner_classifier_default']

    # set language specific classifier, if configured and document language detected
    if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
        # is a language specific classifier there for the detected language?
        if data['language_s'] in parameters['spacy_ner_classifiers']:
            classifier = parameters['spacy_ner_classifiers'][data['language_s']]

    analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # extract sentences from text
    url = "http://localhost:8080/sents"
    if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
        url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'

    headers = {'content-type': 'application/json'}
    d = {'text': text, 'model': classifier}

    retries = 0
    retrytime = 1
    # wait time until next retry will be doubled until reaching a maximum of 120 seconds (2 minutes)
    retrytime_max = 120
    no_connection = True

    while no_connection:
        try:
            if retries > 0:
                print('Retrying to connect to Spacy services in {} second(s).'.format(retrytime))
                time.sleep(retrytime)
                retrytime = retrytime * 2
                if retrytime > retrytime_max:
                    retrytime = retrytime_max

            response = requests.post(url, data=json.dumps(d), headers=headers)
            # if bad status code, raise exception
            response.raise_for_status()

            no_connection = False

        except requests.exceptions.ConnectionError as e:
            retries += 1
            sys.stderr.write("Connection to Spacy services (will retry in {} seconds) failed. Exception: {}\n".format(retrytime, e))

    sentences = response.json()

    etl = ETL()

    sentencenumber = 0

    for sentence in sentences:
        sentencenumber += 1
        partdocid = docid + '#sentence' + str(sentencenumber)

        partparameters = parameters.copy()
        partparameters['plugins'] = [
            'enhance_path', 'enhance_detect_language_tika_server',
            'enhance_entity_linking', 'enhance_multilingual'
        ]

        if 'enhance_ner_spacy' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_spacy')
        if 'enhance_ner_stanford' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_stanford')

        sentencedata = {}
        sentencedata['id'] = partdocid
        sentencedata['container_s'] = docid
        if 'author_ss' in data:
            sentencedata['author_ss'] = data['author_ss']
        sentencedata['content_type_group_ss'] = "Sentence"
        sentencedata['content_type_ss'] = "Sentence"
        sentencedata['content_txt'] = sentence

        # index sentence
        try:
            partparameters, sentencedata = etl.process(partparameters, sentencedata)
        except BaseException as e:
            sys.stderr.write("Exception adding sentence {} : {}".format(sentencenumber, e))

    data['sentences_i'] = sentencenumber

    return parameters, data
parser.add_argument("--crawl", nargs="?", const=True, help="Crawling recent crash dumps.") parser.add_argument("--train", nargs="?", const=True, help="Training for parameter tuning.") parser.add_argument("--stop", nargs="?", const=True, help="Count file names that can be filtered.") parser.add_argument("--detect", nargs=2, help="Detect crash dump similarity.") args = parser.parse_args() # suppress warnings urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) if __name__ == "__main__": # crawling recent crash dumps if args.crawl: ETL().load() # training for parameter tuning if args.train: Train().training() # count file names that can be filtered if args.stop: StopWord().count_word() # detect crash dump similarity if args.detect: Detect(args.detect).detect_sim()
                     weight_decay=args.rmsprop_decay)
g_losses = np.empty(0)

print("Initializing discriminator model and optimizer.")
d_net = Discriminator().cuda()
d_opt = optim.RMSprop(d_net.parameters(), args.learning_rate_d,
                      weight_decay=args.rmsprop_decay)
d_losses = np.empty(0)

if args.retrain:
    g_net.load_state_dict(torch.load('../data/generator_state'))
    d_net.load_state_dict(torch.load('../data/discriminator_state'))

print("Beginning training..")
loader = ETL(args.batch_size, args.image_size, args.path)

for iteration in range(args.iterations):

    # Train discriminator
    for _ in range(args.k_discriminator):
        d_opt.zero_grad()
        d_examples, d_targets = loader.next_batch()
        d_noise = torch.Tensor(args.batch_size, 1, args.image_size,
                               args.image_size).uniform_(-1., 1.)
        d_noise = Variable(d_noise).cuda()
        d_samples = g_net(d_noise, d_examples).detach()
        d_real_pred = d_net(d_targets)
        d_fake_pred = d_net(d_samples)
class Connector_Hypothesis(ETL):

    verbose = False
    documents = True
    token = None
    api = 'https://hypothes.is/api/'
    # how many annotations to download at once / per page
    limit = 10

    # initialize Open Semantic ETL
    etl = ETL()
    etl.read_configfile('/etc/etl/config')
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/hypothesis')
    etl.verbose = verbose

    exporter = export_solr.export_solr()

    #
    # index the annotated document, if not yet in index
    #
    def etl_document(self, uri):
        result = True

        doc_mtime = self.exporter.get_lastmodified(docid=uri)

        if doc_mtime:
            if self.verbose:
                print("Annotated document in search index. No new indexing of {}".format(uri))
        else:
            # Download and index the new or updated uri
            if self.verbose:
                print("Annotated document not in search index. Start indexing of {}".format(uri))

            try:
                etl = Connector_Web()
                etl.index(uri=uri)
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write("Exception while getting {} : {}".format(uri, e))
                result = False

        return result

    #
    # import an annotation
    #
    def etl_annotation(self, annotation):
        parameters = {}
        parameters['plugins'] = ['enhance_multilingual']

        # since there can be multiple annotations for same URI,
        # do not overwrite but add value to existent values of the facet/field/property
        parameters['add'] = True

        data = {}

        # id/uri of the annotated document, not the annotation id
        parameters['id'] = annotation['uri']

        # first index / etl the webpage / document that has been annotated, if not yet in index
        if self.documents:
            result = self.etl_document(uri=annotation['uri'])
            if not result:
                data['etl_error_hypothesis_ss'] = "Error while indexing the document that has been annotated"

        # annotation id
        data['annotation_id_ss'] = annotation['id']
        data['annotation_text_txt'] = annotation['text']

        tags = []
        if 'tags' in annotation:
            if self.verbose:
                print("Tags: {}".format(annotation['tags']))
            for tag in annotation['tags']:
                tags.append(tag)
        data['annotation_tag_ss'] = tags

        # write annotation to database or index
        self.etl.process(parameters=parameters, data=data)

    #
    # import all annotations since last imported annotation
    #
    def etl_annotations(self, last_update="", user=None, group=None, tag=None, uri=None):
        newest_update = last_update

        if not self.api.endswith('/'):
            self.api = self.api + '/'

        searchurl = '{}search?limit={}&sort=updated&order=desc'.format(self.api, self.limit)

        if user:
            searchurl += "&user={}".format(user)
        if group:
            searchurl += "&group={}".format(group)
        if tag:
            searchurl += "&tag={}".format(tag)
        if uri:
            searchurl += "&uri={}".format(uri)

        # Authorization
        headers = {'user-agent': 'Open Semantic Search'}
        if self.token:
            headers['Authorization'] = 'Bearer ' + self.token

        # stats
        stat_downloaded_annotations = 0
        stat_imported_annotations = 0
        stat_pages = 0

        offset = 0
        last_page = False

        while not last_page:

            searchurl_paged = searchurl + "&offset={}".format(offset)

            # Call API / download annotations
            if self.verbose:
                print("Calling hypothesis API {}".format(searchurl_paged))

            request = requests.get(searchurl_paged, headers=headers)

            result = json.loads(request.content.decode('utf-8'))

            stat_pages += 1

            if len(result['rows']) < self.limit:
                last_page = True

            # import annotations
            for annotation in result['rows']:

                stat_downloaded_annotations += 1

                if annotation['updated'] > last_update:
                    if self.verbose:
                        print("Importing new annotation {}annotations/{}".format(self.api, annotation['id']))
                        print(annotation['text'])

                    stat_imported_annotations += 1

                    # save update time from newest annotation/edit
                    if annotation['updated'] > newest_update:
                        newest_update = annotation['updated']

                    self.etl_annotation(annotation)
                else:
                    last_page = True

            offset += self.limit

        # commit to index, if still buffered
        self.etl.commit()

        if self.verbose:
            print("Downloaded annotations: {}".format(stat_downloaded_annotations))
            print("Imported new annotations: {}".format(stat_imported_annotations))

        return newest_update
from etl import ETL

if __name__ == '__main__':
    etl = ETL(url='mongodb://localhost:27017/', db_name='sbp')
    etl.run()