Example #1
0
class Setup:
    """Copies rows obtained from the SQL client into the search index."""

    def __init__(self, config: Config):
        """Create the SQL and ES clients from the same ``config`` object."""
        self.sql_client = SQLClient(config)
        self.es_client = ES_Client(config)

    def populate_index_from_mysql(self):
        """Index every row returned by ``sql_client.fetch_all_bbc()``.

        Each row is read positionally: element 0 is passed to ``index_es``
        as the first argument, element 1 is stored under ``'content'`` and
        element 2 under ``'tags'``.
        """
        for record in self.sql_client.fetch_all_bbc():
            document = dict(tags=record[2], content=record[1])
            self.es_client.index_es(record[0], document)
Example #2
0
 def __init__(self, config: Config, aws_access_key_id,
              aws_secret_access_key, bucket, region, index):
     """Instantiate the helper objects and select the ES index.

     ``config`` is handed to ``ES_Client`` and ``SQLClient``; the AWS
     credentials, ``bucket`` and ``region`` are handed to ``S3Client``.
     ``bucket`` and ``index`` are also kept on the instance unchanged.
     """
     self.pdfUtil = PDFUtil()
     self.esClient = ES_Client(config)
     self.sqlClient = SQLClient(config)
     self.automatedTags = AutomatedTags()
     self.s3Client = S3Client(
         aws_access_key_id, aws_secret_access_key, bucket, region)
     self.bucket, self.index = bucket, index
     # The ES client is given the index name without spaces, lower-cased.
     normalised_index = index.replace(' ', '').lower()
     self.esClient.set_index(normalised_index)
Example #3
0
def setup():
    """(Re)create the module-level config object and the handlers built on it.

    Loads ``./config/config.yaml`` into the global ``config`` and then
    rebinds the globals ``connectionHandler``, ``es_client``, ``sql_client``
    and ``graph_handler`` to fresh instances constructed from it.
    """
    global config, connectionHandler, es_client, sql_client, graph_handler
    config = Config('./config/config.yaml')
    connectionHandler = ConnectionHandler(config)
    es_client = ES_Client(config)
    sql_client = SQLClient(config)
    graph_handler = GraphHandler(config)
Example #4
0
class S3FileProcessor:
    """Extract text from the files of an S3 bucket, tag it and index it.

    ``read_bucket`` is the entry point: it walks the bucket keys, dispatches
    ``.pdf`` / ``.docx`` / ``.txt`` files to the matching ``process_*``
    method, and finally stores the de-duplicated tags through the SQL client.
    """

    # A tag containing any of these characters is rejected by get_tag_string.
    # Raw string: the pattern contains a backslash that is not a valid
    # Python string escape.
    _TAG_REJECT_RE = re.compile(r'[@_!#$%^&*()<>?/\|}{~:°"’”.]')

    def __init__(self, config: Config, aws_access_key_id,
                 aws_secret_access_key, bucket, region, index):
        """Instantiate the helper objects and select the ES index.

        ``config`` is handed to ``ES_Client`` and ``SQLClient``; the AWS
        credentials, ``bucket`` and ``region`` are handed to ``S3Client``.
        """
        self.pdfUtil = PDFUtil()
        self.esClient = ES_Client(config)
        self.sqlClient = SQLClient(config)
        self.automatedTags = AutomatedTags()
        self.s3Client = S3Client(aws_access_key_id, aws_secret_access_key,
                                 bucket, region)
        self.bucket = bucket
        self.index = index
        # The ES client is given the index name without spaces, lower-cased.
        self.esClient.set_index(index.replace(' ', '').lower())

    def get_tag_string(self, text):
        """Return the automated tags for ``text`` as a comma-separated string.

        Tags matching ``_TAG_REJECT_RE`` are dropped and duplicates are
        removed while keeping first-seen order. Returns ``''`` when the
        tagger yields ``None`` or no acceptable tag.
        """
        tags = self.automatedTags.get_tags(text)
        if tags is None:
            return ''
        accepted = [
            tag for tag in tags if self._TAG_REJECT_RE.search(tag) is None
        ]
        return ','.join(dict.fromkeys(accepted))

    def remove_tag_duplicates(self, tag_string):
        """De-duplicate a comma-separated tag string, keeping first-seen order.

        Empty entries (from leading, trailing or doubled commas) are dropped
        so that no blank tag reaches the database.
        """
        unique = dict.fromkeys(tag for tag in tag_string.split(',') if tag)
        return ','.join(unique)

    def _build_es_body(self, text, tag_string):
        """Build the document body shared by every file type."""
        return {
            'content': text,
            'automated_tags': tag_string,
            'tags': '',
            'summary': self.get_summary(text),
        }

    def process_pdf(self, body, key):
        """Index the PDF given as ``body`` under ``key``; return its tag string.

        Entries of the first metadata record are added to the document when
        their value can be decoded as UTF-8; other entries are skipped.
        """
        text = self.pdfUtil.pdf_to_text(body)
        metadata = self.pdfUtil.get_metadata(body)
        print(text)
        # NOTE(review): when no text was extracted an empty string is still
        # sent to index_es, as before — confirm whether that is intended.
        es_body = ''
        tag_string = ''
        if text is not None:
            tag_string = self.get_tag_string(text)
            es_body = self._build_es_body(text, tag_string)
            # An empty metadata result must not abort indexing of the text.
            info = metadata[0] if metadata else {}
            for name in info:
                try:
                    es_body[name] = info[name].decode("utf-8")
                except Exception:
                    # Best effort: skip values that are not decodable bytes.
                    continue
        self.esClient.index_es(key, es_body)
        return tag_string

    def process_doc(self, url, key):
        """Index the document at local path ``url`` under ``key``.

        The text comes from ``textract``; every public, non-callable, truthy
        attribute of the document's core properties is added as a string.
        Returns the tag string computed for the text.
        """
        text = ' '.join(textract.process(url).decode('utf-8').splitlines())
        tag_string = self.get_tag_string(text)
        es_body = self._build_es_body(text, tag_string)
        core_properties = Document(url).core_properties
        for attr in dir(core_properties):
            if attr.startswith("_"):
                continue
            try:
                value = getattr(core_properties, attr)
                if value and not callable(value):
                    es_body[attr] = str(value)
            except Exception:
                # Best effort: a property that cannot be read is skipped.
                continue
        print(self.esClient.index_es(key, es_body))
        return tag_string

    def process_text(self, body, key):
        """Index UTF-8 encoded ``body`` under ``key``; return its tag string.

        Line breaks are replaced by single spaces before tagging.
        """
        text = ' '.join(body.decode('utf-8').splitlines())
        tag_string = self.get_tag_string(text)
        es_body = self._build_es_body(text, tag_string)
        print(self.esClient.index_es(key, es_body))
        return tag_string

    def add_to_unique_tag_list(self, tag_string):
        """Store ``tag_string`` for this index with commas turned into spaces."""
        tag_string = tag_string.replace(',', ' ')
        self.sqlClient.insert_into_tags(self.index, tag_string)

    def read_bucket(self):
        """Process every supported file of the bucket and store the tags.

        A failure on one file is reported and the file is skipped; a failure
        while storing the tags is reported as well. Neither is re-raised.
        """
        keys = self.s3Client.get_s3_keys(self.bucket)
        collected = []
        for key in keys:
            try:
                if key.endswith(".pdf"):
                    body = self.s3Client.get_s3_file_body(key)
                    collected.append(self.process_pdf(body, key))
                elif key.endswith(".docx"):
                    self.s3Client.download_file(self.bucket, key)
                    collected.append(self.process_doc('/tmp/' + key, key))
                elif key.endswith(".txt"):
                    body = self.s3Client.get_s3_file_body(key)
                    collected.append(self.process_text(body, key))
            except Exception as exc:
                # Exception (not a bare except) so Ctrl-C still interrupts.
                print("Skipping processing for this file:", key, exc)
        try:
            tag_string = self.remove_tag_duplicates(','.join(collected))
            self.add_to_unique_tag_list(tag_string)
        except Exception as exc:
            print("Error in adding tags to db:", exc)

    def get_summary(self, text):
        """Return a three-sentence English summary of ``text``.

        The selected sentences are concatenated without a separator.
        """
        LANGUAGE = "english"
        SENTENCES_COUNT = 3
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        return ''.join(
            str(sentence)
            for sentence in summarizer(parser.document, SENTENCES_COUNT)
        )
Example #5
0
 def __init__(self, config: Config):
     """Build the SQL client, then the ES client, from ``config``."""
     sql_backend = SQLClient(config)
     self.sql_client = sql_backend
     search_backend = ES_Client(config)
     self.es_client = search_backend
class GraphHandler:
    """Turn the records returned by ``ES_Client.match_all()`` into graphs.

    Two payloads are produced, both shaped ``{'nodes': [...], 'edges': [...]}``:
    ``doc_entry`` links documents that share tags, ``tag_entry`` links each
    tag to the documents carrying it.
    """

    def __init__(self, config: Config):
        """Create the ES client from ``config``."""
        self.es_client = ES_Client(config)

    @staticmethod
    def _record_tags(record):
        """Return the non-empty tags of one record, manual tags first.

        Reads the comma-separated ``'tags'`` and ``'automated_tags'`` fields
        of ``record['_source']``; a missing or empty field contributes
        nothing, so no real tag is lost and no blank tag is produced.
        """
        source = record['_source']
        tags = []
        for field in ('tags', 'automated_tags'):
            raw = source.get(field) or ''
            tags.extend(tag for tag in raw.split(',') if tag)
        return tags

    def get_all_records(self):
        """Return whatever ``es_client.match_all()`` yields."""
        return self.es_client.match_all()

    def get_nodes(self, record_array):
        """Return one ``{'id', 'label'}`` node per record, ids starting at 1.

        The label is the record's ``'_id'``.
        """
        return [
            {'id': node_id, 'label': record['_id']}
            for node_id, record in enumerate(record_array, start=1)
        ]

    def get_tags_array(self, record_array):
        """Return the tag list of every record, in record order."""
        return [self._record_tags(record) for record in record_array]

    def get_edges(self, tags):
        """Return ``[from, to, label]`` for each pair of entries sharing tags.

        ``from``/``to`` are 1-based positions in ``tags``; ``label`` is the
        comma-joined shared tags, sorted so the output is deterministic.
        """
        # Build each set once instead of once per compared pair.
        tag_sets = [set(entry) for entry in tags]
        edges = []
        for i, left in enumerate(tag_sets):
            for j in range(i + 1, len(tag_sets)):
                shared = left & tag_sets[j]
                if shared:
                    edges.append([i + 1, j + 1, ','.join(sorted(shared))])
        return edges

    def proc_edges(self, edges):
        """Convert ``[from, to, label]`` triples into edge dicts with ids from 1."""
        return [
            {'id': edge_id, 'from': edge[0], 'to': edge[1], 'label': edge[2]}
            for edge_id, edge in enumerate(edges, start=1)
        ]

    def doc_entry(self, conn_name):
        """Return the document graph for the index ``conn_name``."""
        self.es_client.set_index(conn_name)
        records = self.get_all_records()
        nodes = self.get_nodes(records)
        edges = self.proc_edges(self.get_edges(self.get_tags_array(records)))
        return {'nodes': nodes, 'edges': edges}

    def get_tag_dict(self):
        """Map each tag to the ``'$$'``-joined ids of the records carrying it.

        A record is listed once per tag even when the tag appears in both of
        its tag fields.
        """
        tag_dict = {}
        for record in self.get_all_records():
            doc_id = record['_id']
            for tag in dict.fromkeys(self._record_tags(record)):
                if tag in tag_dict:
                    tag_dict[tag] = tag_dict[tag] + "$$" + doc_id
                else:
                    tag_dict[tag] = doc_id
        return tag_dict

    def process_tag_dict(self, tag_dict):
        """Build the tag-to-document graph from a ``get_tag_dict`` mapping.

        Tags and document ids share one label namespace: a label gets a node
        the first time it is seen and is reused afterwards. Every
        (tag, document) pair yields one edge with an empty label.
        """
        node_ids = {}
        nodes_list = []
        edges = []

        def node_id(label):
            # Register the label on first sight; ids are handed out from 1.
            if label not in node_ids:
                node_ids[label] = len(node_ids) + 1
                nodes_list.append({'id': node_ids[label], 'label': label})
            return node_ids[label]

        for tag, joined_docs in tag_dict.items():
            # Always resolve the tag's own node, including when the label was
            # registered earlier, so edges never start from a stale id.
            from_id = node_id(tag)
            for doc in joined_docs.split("$$"):
                edges.append({
                    'id': len(edges) + 1,
                    'from': from_id,
                    'to': node_id(doc),
                    'label': '',
                })
        return {'nodes': nodes_list, 'edges': edges}

    def tag_entry(self, conn_name):
        """Return the tag graph for the index ``conn_name``."""
        print(conn_name)
        self.es_client.set_index(conn_name)
        tag_dict = self.get_tag_dict()
        return self.process_tag_dict(tag_dict)