class Setup:
    def __init__(self, config: Config):
        self.sql_client = SQLClient(config)
        self.es_client = ES_Client(config)

    def populate_index_from_mysql(self):
        # Mirror every BBC row from MySQL into Elasticsearch:
        # row[0] is the document id, row[1] the content, row[2] the tags.
        rows = self.sql_client.fetch_all_bbc()
        for row in rows:
            body = {'tags': row[2], 'content': row[1]}
            self.es_client.index_es(row[0], body)
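# Usage sketch (an assumption, not part of the original source): the config
# path mirrors the one used in setup() below; a one-off backfill from MySQL
# into Elasticsearch might be run like this.
if __name__ == '__main__':
    config = Config('./config/config.yaml')
    Setup(config).populate_index_from_mysql()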
def setup():
    global connectionHandler, es_client, sql_client, graph_handler, config
    config = Config('./config/config.yaml')
    connectionHandler = ConnectionHandler(config)
    es_client = ES_Client(config)
    sql_client = SQLClient(config)
    graph_handler = GraphHandler(config)
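# Interface sketch (an assumption, for readers without the client modules):
# nothing in this section relies on more than the methods below, so local
# stand-ins for ES_Client and SQLClient could look roughly like this.
from typing import Protocol

class ESClientLike(Protocol):
    def set_index(self, index: str) -> None: ...
    def index_es(self, doc_id: str, body) -> dict: ...
    def match_all(self) -> list: ...

class SQLClientLike(Protocol):
    def fetch_all_bbc(self) -> list: ...
    def insert_into_tags(self, index: str, tag_string: str) -> None: ...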
import re

import textract
from docx import Document
# get_summary() uses sumy. The flattened source does not show its imports, so
# the summarizer choice below (LSA, from sumy's documented example) is an
# assumption; any sumy summarizer with the same interface would fit.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

# Config, ES_Client, SQLClient, PDFUtil, AutomatedTags and S3Client are
# project-local modules, assumed to be imported from elsewhere in the repo.


class S3FileProcessor:
    def __init__(self, config: Config, aws_access_key_id, aws_secret_access_key, bucket, region, index):
        self.pdfUtil = PDFUtil()
        self.esClient = ES_Client(config)
        self.sqlClient = SQLClient(config)
        self.automatedTags = AutomatedTags()
        self.s3Client = S3Client(aws_access_key_id, aws_secret_access_key, bucket, region)
        self.bucket = bucket
        self.index = index
        # Elasticsearch index names may not contain spaces or uppercase letters.
        self.esClient.set_index(index.replace(' ', '').lower())

    def get_tag_string(self, text):
        # Build a comma-separated string of automated tags, dropping any tag
        # that contains punctuation or other special characters.
        tags = self.automatedTags.get_tags(text)
        if tags is None:
            return ''
        regex = re.compile(r'[@_!#$%^&*()<>?/\\|}{~:°"’”.]')
        tag_array = [tag for tag in tags if regex.search(tag) is None]
        # dict.fromkeys deduplicates while preserving order.
        return ','.join(dict.fromkeys(tag_array))

    def remove_tag_duplicates(self, tag_string):
        # Deduplicate, and drop the empty entries left by the leading commas
        # that read_bucket() prepends.
        tags = [tag for tag in tag_string.split(',') if tag]
        return ','.join(dict.fromkeys(tags))

    def process_pdf(self, body, key):
        text = self.pdfUtil.pdf_to_text(body)
        print(text)
        es_body = {}
        tag_string = ''
        if text:
            tag_string = self.get_tag_string(text)
            es_body = {
                'content': text,
                'automated_tags': tag_string,
                'tags': '',
                'summary': self.get_summary(text)
            }
        # Copy whatever PDF metadata decodes cleanly into the document body.
        metadata = self.pdfUtil.get_metadata(body)
        for obj in metadata[0]:
            try:
                es_body[obj] = metadata[0][obj].decode('utf-8')
            except Exception:
                continue
        self.esClient.index_es(key, es_body)
        return tag_string

    def process_doc(self, url, key):
        text = ' '.join(textract.process(url).decode('utf-8').splitlines())
        es_body = {}
        tag_string = ''
        if text:
            tag_string = self.get_tag_string(text)
            es_body = {
                'content': text,
                'automated_tags': tag_string,
                'tags': '',
                'summary': self.get_summary(text)
            }
        # Copy the non-private, non-callable core properties (author, title,
        # created, ...) off the .docx into the document body.
        core_properties = Document(url).core_properties
        metadata = [
            attr for attr in dir(core_properties)
            if not callable(getattr(core_properties, attr))
            and not attr.startswith('_')
        ]
        for meta in metadata:
            try:
                if getattr(core_properties, meta):
                    es_body[meta] = str(getattr(core_properties, meta))
            except Exception:
                continue
        print(self.esClient.index_es(key, es_body))
        return tag_string

    def process_text(self, body, key):
        text = ' '.join(body.decode('utf-8').splitlines())
        es_body = {}
        tag_string = ''
        if text:
            tag_string = self.get_tag_string(text)
            es_body = {
                'content': text,
                'automated_tags': tag_string,
                'tags': '',
                'summary': self.get_summary(text)
            }
        print(self.esClient.index_es(key, es_body))
        return tag_string

    def add_to_unique_tag_list(self, tag_string):
        # The tags table stores one space-separated tag string per index.
        self.sqlClient.insert_into_tags(self.index, tag_string.replace(',', ' '))

    def read_bucket(self):
        keys = self.s3Client.get_s3_keys(self.bucket)
        tag_string = ''
        for key in keys:
            try:
                if key.endswith('.pdf'):
                    body = self.s3Client.get_s3_file_body(key)
                    tag_string = tag_string + ',' + self.process_pdf(body, key)
                if key.endswith('.docx'):
                    # textract and python-docx need a local file, so download it first.
                    self.s3Client.download_file(self.bucket, key)
                    tag_string = tag_string + ',' + self.process_doc('/tmp/' + key, key)
                if key.endswith('.txt'):
                    body = self.s3Client.get_s3_file_body(key)
                    tag_string = tag_string + ',' + self.process_text(body, key)
            except Exception:
                print('Skipping processing for ' + key)
        try:
            tag_string = self.remove_tag_duplicates(tag_string)
            self.add_to_unique_tag_list(tag_string)
        except Exception:
            print('Error in adding tags to db')

    def get_summary(self, text):
        # Extractive three-sentence summary of the document text.
        LANGUAGE = 'english'
        SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        # Join with spaces so the selected sentences do not run together.
        return ' '.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
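# Usage sketch (an assumption, not part of the original source): credentials,
# bucket, region and index name below are placeholders.
if __name__ == '__main__':
    config = Config('./config/config.yaml')
    processor = S3FileProcessor(config, 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
                                'my-bucket', 'eu-west-1', 'My Connection')
    # Index every .pdf/.docx/.txt in the bucket and record the unique tags.
    processor.read_bucket()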
class GraphHandler:
    def __init__(self, config: Config):
        self.es_client = ES_Client(config)

    def get_all_records(self):
        return self.es_client.match_all()

    def get_nodes(self, record_array):
        # One numbered node per document; the ES _id becomes the label.
        nodes = []
        for i, record in enumerate(record_array, start=1):
            nodes.append({'id': i, 'label': record['_id']})
        return nodes

    def get_tags_array(self, record_array):
        tags = []
        for record in record_array:
            tag = record['_source']['tags'].split(',') + \
                record['_source']['automated_tags'].split(',')
            # The manual 'tags' field is stored empty at indexing time, so
            # its split() leaves a leading '' entry; drop it.
            tag.pop(0)
            tags.append(tag)
        return tags

    def get_edges(self, tags):
        # Connect two documents whenever their tag sets intersect; the edge
        # label lists the shared tags.
        edges = []
        for i in range(len(tags)):
            for j in range(i + 1, len(tags)):
                common = set(tags[i]) & set(tags[j])
                if common:
                    edges.append([i + 1, j + 1, ','.join(common)])
        return edges

    def proc_edges(self, edges):
        ed = []
        for i, edge in enumerate(edges, start=1):
            ed.append({'id': i, 'from': edge[0], 'to': edge[1], 'label': edge[2]})
        return ed

    def doc_entry(self, conn_name):
        # Document-centric graph: nodes are documents, edges are shared tags.
        self.es_client.set_index(conn_name)
        records = self.get_all_records()
        nodes = self.get_nodes(records)
        edges = self.proc_edges(self.get_edges(self.get_tags_array(records)))
        return {'nodes': nodes, 'edges': edges}

    def get_tag_dict(self):
        # Map each tag to a '$$'-separated list of the document ids using it.
        record_array = self.get_all_records()
        tag_dict = dict()
        for record in record_array:
            tag = record['_source']['tags'].split(',') + \
                record['_source']['automated_tags'].split(',')
            tag.pop(0)
            for t in tag:
                if t not in tag_dict:
                    tag_dict[t] = record['_id']
                else:
                    tag_dict[t] = tag_dict[t] + '$$' + record['_id']
        return tag_dict

    def process_tag_dict(self, tag_dict):
        # Tag-centric graph: every tag and every document becomes a node,
        # and each tag is linked to the documents it appears in.
        nodes_list = []
        nodes = dict()  # label -> node id, shared by tags and documents
        edges = []
        i = 1  # next node id
        k = 1  # next edge id
        for key in tag_dict:
            if key not in nodes:
                nodes[key] = i
                nodes_list.append({'id': i, 'label': key})
                i = i + 1
            from_id = nodes[key]
            for doc in tag_dict[key].split('$$'):
                if doc not in nodes:
                    nodes[doc] = i
                    nodes_list.append({'id': i, 'label': doc})
                    i = i + 1
                edges.append({'id': k, 'from': from_id, 'to': nodes[doc], 'label': ''})
                k = k + 1
        return {'nodes': nodes_list, 'edges': edges}

    def tag_entry(self, conn_name):
        print(conn_name)
        self.es_client.set_index(conn_name)
        return self.process_tag_dict(self.get_tag_dict())
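# Usage sketch (an assumption, not part of the original source): both entry
# points return {'nodes': [...], 'edges': [...]} dicts shaped for a network
# visualisation (id/label nodes, from/to/label edges).
if __name__ == '__main__':
    handler = GraphHandler(Config('./config/config.yaml'))
    doc_graph = handler.doc_entry('myindex')  # documents linked by shared tags
    tag_graph = handler.tag_entry('myindex')  # tags linked to their documents
    print(len(doc_graph['nodes']), len(tag_graph['edges']))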