Example #1
    def test_similar_docs_complex(self):
        doc1 = Document()
        doc2 = Document()
        doc1.title = "Fred Phelps, Head Of Westboro Baptist Church, Dies"
        doc1.content = "Fred Phelps, anti-gay activist and patriarch of the Westboro Baptist Church, has died at age 84. Frank Morris of KCUR reports on the interesting past of one of the most reviled men in America. "
        doc2.title = "Westboro Baptist Church Says Leader Fred Phelps 'Has Gone The Way of All Flesh'"
        doc2.content = "The Bible-thumping, anti-gay preacher was known for picketing funerals."
        distance = self.state.repository.clustering.comparator.similarity(doc1, doc2)
        for word in doc1.words() & doc2.words():
            logging.info("tf_idf of \"%s\": %2.5f", word, self.state.repository.index.tf_idf(word))

        for word in doc1.words() ^ doc2.words():
            logging.info("tf_idf of non-intersecting \"%s\": %2.5f", word, self.state.repository.index.tf_idf(word))
        logging.info("Similarity is %2.2f", distance)
        self.assertGreater(distance, 0.15)
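The comparator exercised by these tests is not shown in the examples. Below is a minimal sketch, not the project's actual implementation, of a tf-idf-weighted cosine similarity, assuming Document.words() returns a set of words and index.tf_idf(word) returns a float, as the tests above suggest.

import math

class TfIdfComparator:
    """Hypothetical sketch: cosine similarity over tf-idf-weighted word sets."""

    def __init__(self, index):
        self.index = index  # assumed to expose tf_idf(word) -> float

    def similarity(self, doc1, doc2):
        words1, words2 = doc1.words(), doc2.words()
        weights1 = {w: self.index.tf_idf(w) for w in words1}
        weights2 = {w: self.index.tf_idf(w) for w in words2}
        # dot product over the shared words, normalised by both vector lengths
        dot = sum(weights1[w] * weights2[w] for w in words1 & words2)
        norm1 = math.sqrt(sum(v * v for v in weights1.values()))
        norm2 = math.sqrt(sum(v * v for v in weights2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)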
Example #2
    def ingest(self,
               dataset_name,
               dataset_source,
               dataset_description,
               dataset_author,
               dataset_notes,
               dataset_creation_time,
               dataset_tags,
               online=True):
        """
        The following will clean, parse, and upload datasets to our database.

        :param dataset_name: Name of the dataset.
        :param dataset_source: Source of the dataset (i.e. filename or URL).
        :param dataset_description: Description of the dataset.
        :param dataset_author: Author of the dataset.
        :param dataset_notes: Any notes on the dataset by us.
        :param dataset_creation_time: Time the dataset was created.
        :param online: boolean of whether the data is a local file (offline) or a URL (online).
        """
        if CSVParser.is_csv(dataset_source):
            if online:
                raw_documents = CSVParser.convert_csv_url_to_json_list(
                    dataset_source)
            else:
                raw_documents = CSVParser.convert_csv_file_to_json_list(
                    dataset_source)
            dataset_attributes = raw_documents[0].keys()
            es_documents = [
                Document(dataset_name, raw_document).get_es_document()
                for raw_document in raw_documents
            ]
            self.es.bulk_upload(es_documents)
        else:
            # Bail out early: the metadocument below references es_documents and
            # dataset_attributes, which are only defined for supported (CSV) sources.
            print("Unsupported file format.")
            return

        metadocument = {
            "dataset_name": dataset_name,
            "dataset_description": dataset_description,
            "dataset_notes": dataset_notes,
            "dataset_keywords":
            None,  # TODO: Add explicit keywords for datasets through ML
            "dataset_tags": dataset_tags,
            "dataset_author": dataset_author,
            "time_ingested": calendar.timegm(time.gmtime()),
            "time_created": dataset_creation_time,
            "dataset_source": dataset_source,
            "dataset_attributes": dataset_attributes,
            "dataset_num_docs": len(es_documents),
        }
        self.es.bulk_upload(
            [Metadocument(metadocument, dataset_name).get_es_document()])
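A hypothetical call to ingest; the ingester instance, URL, and all field values below are made up for illustration and simply mirror the parameter list above.

import calendar
import time

ingester.ingest(
    dataset_name="city_budgets",
    dataset_source="https://example.org/data/city_budgets.csv",
    dataset_description="Annual budget figures per city department",
    dataset_author="Finance Office",
    dataset_notes="Pulled from the open-data portal",
    dataset_creation_time=calendar.timegm(time.gmtime()),
    dataset_tags=["finance", "budget"],
    online=True,  # dataset_source is a URL
)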
Example #3
    def test_very_similar_docs(self):
        doc1 = Document()
        doc2 = Document()
        doc1.title = "russia invades ukraine"
        doc1.content = "russia invades ukraine"
        doc2.title = "russia invaded by ukraine"
        doc2.content = "russia invaded ukraine"
        distance = self.state.repository.clustering.comparator.similarity(doc1, doc2)
        logging.info("Similarity is %2.2f", distance)
        self.assertGreater(distance, 0.3)
Example #4
    def create_from_docs(self, docs_json):
        # time and log
        start = time.time()
        self.logger.info("Creating documents...")

        # init variables
        self.docs = [None] * len(docs_json)

        # load documents and tokenize
        for i, key in enumerate(docs_json.keys()):
            progbar(i, len(self.docs), 20)
            doc = Document(int(key), docs_json[key])
            self.docs[int(key)] = doc

        end = time.time()
        self.logger.info("Creating document complete. elapsed time: " +
                         str(end - start) + " secs")
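For illustration, create_from_docs appears to expect a dict keyed by stringified, contiguous integer ids, since it allocates the list up front and indexes it by int(key). A hypothetical call (the corpus instance and the contents are assumptions):

docs_json = {
    "0": "russia invades ukraine",
    "1": "portugal is very very cool",
}
corpus.create_from_docs(docs_json)  # `corpus` stands in for the owning object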
Example #5
    def test_somewhat_different_docs(self):
        doc1 = Document()
        doc2 = Document()
        doc1.title = "russia bought cars in portugal"
        doc1.content = "russia went to portugal and bought 5 new cars"
        doc2.title = "portugal is cool"
        doc2.content = "portugal is very very cool"
        distance = self.state.repository.clustering.comparator.similarity(doc1, doc2)
        logging.info("Similarity is %2.2f", distance)
        logging.info("tf_idf of \"is\": %2.5f", self.state.repository.index.tf_idf("is"))
        logging.info("tf_idf of \"portugal\": %2.5f", self.state.repository.index.tf_idf("portugal"))
        logging.info("tf_idf of \"russia\": %2.5f", self.state.repository.index.tf_idf("russia"))
        self.assertLess(distance, 0.15)
Example #6
    def test_very_different_docs(self):
        doc1 = Document()
        doc2 = Document()
        doc1.title = "russia invades ukraine"
        doc1.content = "russia is invading ukraine again"
        doc2.title = "portugal is cool"
        doc2.content = "portugal is very very cool"
        distance = self.state.repository.clustering.comparator.similarity(doc1, doc2)
        logging.info("Similarity is %2.2f", distance)
        logging.info("tf_idf of \"is\": %2.5f", self.state.repository.index.tf_idf("is"))
        logging.info("tf_idf of \"portugal\": %2.5f", self.state.repository.index.tf_idf("portugal"))
        logging.info("tf_idf of \"ukraine\": %2.5f", self.state.repository.index.tf_idf("ukraine"))
        self.assertLess(distance, 0.05)
Example #7
    def load_docs(self, fname, nrows=None):
        cntr = 0
        with open(fname, 'r') as f:
            for line in f:
                idx, text = line.rstrip("\n").split("\t", 1)
                idx = int(idx)
                doc = Document('')
                doc.set_tokens(text)
                doc.set_id(idx)
                self.docs_id[idx] = doc

                cntr += 1
                if nrows is not None and cntr == nrows:
                    break
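load_docs expects one tab-separated record per line: an integer id, a tab, then the tokenized text. A hypothetical input file and call (the loader instance and file name are assumptions):

# docs.tsv, hypothetical contents:
# 0\trussia invades ukraine
# 1\tportugal is very very cool
loader.load_docs("docs.tsv", nrows=1000)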
Example #8
    def get_docs_from_xml(self, root):
        docs = []
        for channel in root:
            for item in channel.findall("item"):
                new_doc = Document()
                new_doc.title = item.find("title").text or ""

                new_doc.download_date = datetime.now(tz.tzutc())
                new_doc.publish_date = dateparser.parse(item.find("pubDate").text, "") or new_doc.download_date
                if new_doc.publish_date.tzinfo is None or self.force_timezone:
                    new_doc.publish_date=new_doc.publish_date.replace(tzinfo=self.timezone)
                new_doc.publish_date = new_doc.publish_date.astimezone(tz.tzutc())

                new_doc.source_url = item.find("link").text or ""

                new_doc.original_summary = strip_html(item.find("description").text or "")

                if item.find("guid"):
                    new_doc.guid = hashlib.md5(item.find("guid").encode('utf-8')).hexdigest()
                else:
                    new_doc.guid = hashlib.md5(new_doc.source_url.encode('utf-8')).hexdigest()
                new_doc.provider = self.name

                if new_doc.guid not in self.processed_guids:
                    self.processed_guids[new_doc.guid] = True
                    self.document_count += 1
                    docs.append(new_doc)

        return docs
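A hypothetical way to drive get_docs_from_xml with the standard library XML parser; the provider instance and the rss_feed_text string are assumptions for illustration.

import xml.etree.ElementTree as ET

root = ET.fromstring(rss_feed_text)      # raw RSS XML as a string
docs = provider.get_docs_from_xml(root)  # provider is an instance of the class above
print("%d new documents" % len(docs))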
Example #9
def assign_text_to_speaker(body, doc_graph):
    """ Fill the dictionary values with each speaker's role, job, and respective text.
    The full call transcript is distributed across the values of the dictionary.
    """

    sections = subdivide(body, "^=+")
    # regex pattern for matching headers of each section
    header_pattern = re.compile("^.*[^\n]", re.MULTILINE)

    # regex pattern for matching the sections that contain
    # the list of attendees (those that start with asterisks)
    #if unicode("Corporate Participants", "utf-8") in sections:
    ppl_pattern = re.compile("^(\s+\*)(.+)(\s.*)", re.MULTILINE)
    #else:
    #    ppl_pattern = re.compile("^(\s+\*)(\s.*)", re.MULTILINE)

    # regex pattern for matching sections with subsections in them.
    dash_pattern = re.compile("^-+", re.MULTILINE)

    ppl_d = dict()
    #ppl_d['Operator'] = ['Operator', 'Operator']
    talks_d = dict()

    header = []
    # Step2. Handle each section like a switch case
    for section in sections:
        # Handle headers
        # a section consisting of a single line is most likely just a header
        if len(section.split('\n')) == 1:
            header = header_pattern.match(section).string

        # Handle attendees/presenters
        elif ppl_pattern.match(section):
            #if unicode("Corporate Participants", "utf-8") in sections:
            ppls = ppl_pattern.findall(section)
            d = {key.strip(): value.strip() for (_, key, value) in ppls}
            #else:
            #    ppls = ppl_pattern.findall(section)
            #    ppls_list = []
            #    for i in ppls:
            #        val = unicode('particiapnt', 'utf-8')
            #        ppls_new = i + (val,)
            #        ppls_list.append(ppls_new)
            #    d = {key.strip(): value.strip() for (_, key, value) in ppls_list}

            # assuming that if the previous section was detected as a header, then this section will relate
            # to that header
            if header:
                for key, value in d.items():
                    d[key] = [value, header]
            ppl_d.update(d)

        # Handle Presentations/Q&A subsections
        elif dash_pattern.findall(section):
            heading, d = process_dashed_sections(section)
            talks_d.update({heading: d})
            for speaker, text in d.items():
                if 'operator' in speaker.lower():
                    continue
                doc = Document(text=text)
                doc_graph.add_node(doc, text)

        # Otherwise it's just free-form text.
        else:
            # assuming that if the previous section was detected as a header,
            # then this section relates to that header
            if header:
                talks_d.update({header: section})

    # Assign the talks material (as a list) to the appropriate attendee/presenter; this still works if no match is found.
    for key, value in talks_d.items():
        talks_d[key] = assign_attendee(value, ppl_d, doc_graph)

    return talks_d, doc_graph
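A hypothetical driver for assign_text_to_speaker; the transcript file name and the doc_graph instance are assumptions. The body is expected to use lines of '=' characters as section dividers, as the subdivide(body, "^=+") call suggests.

with open("call_transcript.txt") as f:
    body = f.read()
talks_d, doc_graph = assign_text_to_speaker(body, doc_graph)
for heading in talks_d:
    print(heading)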
Example #10
def doc_dummy():
    return Document('erstes valid', BESCHREIBUNG, 0)