Example #1
    def get_all_neighbors(self):
        self.vectorize()

        debug.header("Identifying similar items...")

        runtime_start = dt.datetime.now()
        for i in range(len(self.items)):
            neighbors = self.get_neighbors(i)
            self.sorted_neighbors_lists.append(neighbors)
            self.neighbors_list_by_dist.append(neighbors)
        # Deduplicate clusters: process neighbor lists from shortest to longest and
        # drop any list that is highly similar to one still waiting in the queue.
        self.sorted_neighbors_lists.sort(key=len)
        non_duplicate_neighbors = []
        for _ in range(len(self.sorted_neighbors_lists)):
            query = self.sorted_neighbors_lists.pop(0)
            if any(
                    fuzz.partial_token_set_ratio(query, x) >
                    settings.max_item_similarity
                    for x in self.sorted_neighbors_lists):
                continue
            non_duplicate_neighbors.append(query)
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(f"Identification complete in {run_time} seconds!")
        debug.divider()

        # Report the largest clusters first, skipping any with three or fewer items.
        for neighbor in sorted(non_duplicate_neighbors, key=len, reverse=True):
            if len(neighbor) > 3:
                # items_dict.summarize_neighbor_titles(list(neighbor))
                # neighbor = list(neighbor)
                neighbor = sort_by_time(self.items, neighbor)
                output2.topic(self.items, neighbor)
                output2.summary(self.items, neighbor)
                output2.neighbors(self.items, neighbor)
                print()
                output2.divider()
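
The deduplication pass above relies on fuzz.partial_token_set_ratio (presumably from the fuzzywuzzy package) to decide whether two clusters cover the same story. Below is a minimal standalone illustration of the same shortest-first pop-and-compare pattern, run on plain headline strings with an illustrative threshold of 90 in place of settings.max_item_similarity:

from fuzzywuzzy import fuzz

headlines = [
    "Stocks rally as markets rebound",
    "Markets rebound in broad stocks rally",
    "Local team wins championship game",
]

# Shortest first, then drop any entry that is a near-duplicate of one still queued.
headlines.sort(key=len)
unique = []
for _ in range(len(headlines)):
    query = headlines.pop(0)
    if any(fuzz.partial_token_set_ratio(query, other) > 90 for other in headlines):
        continue
    unique.append(query)

print(unique)  # the two "stocks rally" headlines collapse into a single entry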
Example #2
    def save_to_file(self, filepath):
        debug.header(f"Exporting cache to '{filepath}'...")

        runtime_start = dt.datetime.now()

        # Build JSON-compatible dict for cache items
        cache_dict = {}
        for key in self.items:
            item_dict = self.items[key]
            # if isinstance(item_dict["pub_date"], dt.datetime):
            #     pub_date = datetime_to_string(item_dict["pub_date"])
            # else:
            #     pub_date = item_dict["pub_date"]

            cache_dict[key] = {
                "title": item_dict["title"],
                "source": item_dict["source"],
                "link": item_dict["link"],
                "pub_date": datetime_to_string(item_dict["pub_date"]),
                "content": item_dict["content"]
            }

        # Store cache info (e.g., n_items)
        export_dict = {
            "information": {
                "n_items": self.n_items
            },
            "cache": cache_dict
        }

        # Export cache.
        with open(filepath, "w") as f:
            json.dump(export_dict, f, indent=4)
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(f"Export complete in {run_time} seconds!")
Example #3
    def build_from_file(self, cache_filepath):
        debug.header("Building cache from file...")

        runtime_start = dt.datetime.now()
        with open(cache_filepath, "r") as f:
            import_dict = json.load(f)
        self.n_items = import_dict["information"]["n_items"]
        cache_dict = import_dict["cache"]
        imported_items_count = 0
        skipped_items_count = 0
        for key in cache_dict:
            pub_date = string_to_datetime(cache_dict[key]["pub_date"])
            # Skip cached items whose publication date falls outside the time window.
            if time_check(pub_date):
                debug.skip_item(cache_dict[key]['title'],
                                cache_dict[key]['source'],
                                cache_dict[key]['pub_date'])
                skipped_items_count += 1
            else:
                item_dict = {
                    "title": cache_dict[key]["title"],
                    "source": cache_dict[key]["source"],
                    "link": cache_dict[key]["link"],
                    "pub_date": pub_date,
                    "content": cache_dict[key]["content"]
                }
                self.add_item(item_id=int(key),
                              item_dict=item_dict,
                              increment_item_count=False)
                imported_items_count += 1
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(
            f"Build complete in {run_time} seconds!",
            f"{imported_items_count} items added — {skipped_items_count} items skipped"
        )
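
time_check is used here and again in clean() below to decide whether an item is stale, but its implementation is not shown. A plausible sketch, assuming it compares the publication date against a cutoff derived from the configured time window (the module-level constant stands in for settings.time_period_in_hours):

import datetime as dt

TIME_PERIOD_IN_HOURS = 24  # stand-in for settings.time_period_in_hours

def time_check(pub_date):
    """Return True when an item is older than the configured time window,
    i.e. it should be skipped on import or removed by clean()."""
    cutoff = dt.datetime.now() - dt.timedelta(hours=TIME_PERIOD_IN_HOURS)
    return pub_date < cutoff

print(time_check(dt.datetime.now()))                         # False: fresh item
print(time_check(dt.datetime.now() - dt.timedelta(days=3)))  # True: stale item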
Example #4
    def vectorize(self):
        debug.header("Vectorizing cache contents...")

        runtime_start = dt.datetime.now()
        if not self.content_dict:
            self.build_cluster_dict()
        warnings.filterwarnings("ignore",
                                category=UserWarning,
                                module='sklearn.feature_extraction.text')
        self.tfs = self.tfidf.fit_transform(self.content_dict.values())
        self.model_tf_idf.fit(self.tfs)
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(f"Vectorization complete in {run_time} seconds!")
Example #5
    def clean(self):
        debug.header("Cleaning cache...")

        runtime_start = dt.datetime.now()
        count_items_removed = 0
        # Iterate over a snapshot of the keys so entries can be deleted mid-loop.
        for key in list(self.items):
            item_dict = self.items[key]
            if time_check(item_dict["pub_date"]):
                if settings.DIV_show_all_items or settings.DIV_show_removed_items:
                    debug.log(
                        f"Removing '{item_dict['title']}' [{item_dict['source']}]...",
                        indents=1)
                del self.items[key]
                count_items_removed += 1
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(f"Cleaning complete in {run_time} seconds!",
                     f"{count_items_removed} items removed")
Example #6
import datetime as dt
import re

from transformers import BartForConditionalGeneration, BartTokenizer

import newsy.debugger as debug

PRETRAINED_MODEL = 'facebook/bart-large-cnn'

# debug.log('*' * debug.rem_len(settings.TW_wrapping_width))
# debug.log("")
debug.header(f"Initializing '{PRETRAINED_MODEL}' summarization model...")

runtime_start = dt.datetime.now()

model = BartForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
tokenizer = BartTokenizer.from_pretrained(PRETRAINED_MODEL)

run_time = (dt.datetime.now() - runtime_start).seconds

debug.footer(f"Initialization complete in {run_time} seconds!")


def field_from_neighbors(items_dict, neighbors_list, field, remove_punc=False):
    """Concatenate one field from every item in a neighbor cluster into a single
    period-delimited string suitable for summarization."""
    text_to_summarize = ""
    for item_id in neighbors_list:
        text = items_dict[item_id][field]
        if remove_punc:
            # Remove periods that follow non-digit characters; decimal points
            # such as "3.5" are left intact.
            regex = r"(?<!\w)(\D)\.|(\D)(\.)(\D| |$)"
            sub_string = "\\1\\2\\4"
            text = re.sub(regex, sub_string, text)
            # text = text.replace(".", "") + "."
        if text and not text.endswith("."):
            text += "."
        text_to_summarize += text + " "
    return text_to_summarize.strip()
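
field_from_neighbors only prepares the text. A hedged sketch of how that text might then be passed to the model and tokenizer initialized at the top of this module (the helper name and generation parameters are illustrative defaults, not values from the project):

def summarize(text, max_input_tokens=1024, max_summary_tokens=150):
    """Hypothetical helper: summarize concatenated neighbor text with the BART
    model and tokenizer loaded at module import above."""
    inputs = tokenizer(text, return_tensors="pt",
                       truncation=True, max_length=max_input_tokens)
    summary_ids = model.generate(inputs["input_ids"],
                                 num_beams=4,
                                 max_length=max_summary_tokens,
                                 early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)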
Example #7
    def build_from_feeds(self, feed_filepath):
        """Builds a cache based on a given dictionary of RSS feeds"""
        debug.header("Building cache from feeds...")

        runtime_start = dt.datetime.now()
        added_items_count = 0
        items_skipped_count = 0
        feed_count = 0
        feed_dict = feeds.import_feeds(feeds_filepath=feed_filepath)
        num_feeds = len(feed_dict)

        for feed in feed_dict:
            feed_count += 1
            debug.feed_label(feed, feed_count, num_feeds)
            rss_url = f"https://news.google.com/rss/search?q=when:" \
                      f"{settings.time_period_in_hours}h" \
                      f"+allinurl:{feed_dict[feed]}&ceid=US:en&hl=en-US&gl=US"
            d = feedparser.parse(rss_url)

            for entry in d.entries:

                # Get title
                title, source = feeds.clean_title(entry.title)

                # Get link
                url = feeds.clean_url(entry.link)

                if (feeds.is_forbidden(url)
                        or hash(url) in self.url_hashes
                        or hash(title) in self.title_hashes):
                    debug.skip_item(title, source, url)
                    items_skipped_count += 1
                    continue

                # Get publication date
                pub_date = dt.datetime(*entry.published_parsed[:6])

                # Get content
                headers = {
                    "User-Agent":
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, "
                    "like Gecko) Chrome/51.0.2704.103 Safari/537.36"
                }
                try:
                    # Time out rather than hang indefinitely on an unresponsive host.
                    r = requests.get(url, headers=headers, timeout=10)
                except requests.exceptions.RequestException:
                    debug.skip_item(title, source, url)
                    items_skipped_count += 1
                    continue
                content = feeds.extract_content(r.content)
                content = content.encode("ascii", "ignore")
                content = content.decode()

                if hash(content) in self.content_hashes:
                    debug.skip_item(title, source, url)
                    items_skipped_count += 1
                    continue

                # Get keywords
                # keywords = preprocess(content)

                # Add entry to cache.
                item_dict = {
                    "title": title,
                    "source": source,
                    "link": url,
                    "pub_date": pub_date,
                    "content": content
                }
                self.add_item(item_id=self.n_items, item_dict=item_dict)
                added_items_count += 1
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(
            f"Build complete in {run_time} seconds!",
            f"{added_items_count} items added — {items_skipped_count} items skipped"
        )
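
add_item itself is not included in these excerpts. The sketch below shows what it plausibly does, inferred from the duplicate checks above and the increment_item_count flag used in build_from_file; it assumes url_hashes, title_hashes and content_hashes are sets, which the original code does not confirm:

    def add_item(self, item_id, item_dict, increment_item_count=True):
        # Hypothetical sketch: store the item and register its hashes so later
        # feed entries with the same url, title, or content are skipped.
        self.items[item_id] = item_dict
        self.url_hashes.add(hash(item_dict["link"]))
        self.title_hashes.add(hash(item_dict["title"]))
        self.content_hashes.add(hash(item_dict["content"]))
        if increment_item_count:
            self.n_items += 1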
Example #8
    def build_from_single_rss(self, rss_url):
        """Builds a cache based on a given RSS feed"""
        debug.header("Building cache from single RSS feed...")

        runtime_start = dt.datetime.now()
        added_items_count = 0
        items_skipped_count = 0

        d = feedparser.parse(rss_url)

        for entry in d.entries:
            # Get title
            title = entry.title
            source = entry.author

            # Get link
            url = feeds.lex_url(entry.link)

            if (feeds.is_forbidden(url)
                    or hash(url) in self.url_hashes
                    or hash(title) in self.title_hashes):
                debug.skip_item(title, source, url)
                items_skipped_count += 1
                continue

            # Get publication date
            pub_date = dt.datetime(*entry.published_parsed[:6])

            # Get content
            headers = {
                "User-Agent":
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, "
                "like Gecko) Chrome/51.0.2704.103 Safari/537.36"
            }
            try:
                # Time out rather than hang indefinitely on an unresponsive host.
                r = requests.get(url, headers=headers, timeout=10)
            except requests.exceptions.RequestException:
                debug.skip_item(title, source, url + " (unable to connect)")
                items_skipped_count += 1
                continue
            content = feeds.extract_content(r.content)
            content = content.encode("ascii", "ignore")
            content = content.decode()

            if hash(content) in self.content_hashes:
                debug.skip_item(title, source, url)
                items_skipped_count += 1
                continue

            # Get keywords
            # keywords = preprocess(content)

            # Add entry to cache.
            item_dict = {
                "title": title,
                "source": source,
                "link": url,
                "pub_date": pub_date,
                "content": content
            }
            self.add_item(item_id=self.n_items, item_dict=item_dict)
            added_items_count += 1
        run_time = (dt.datetime.now() - runtime_start).seconds
        debug.footer(
            f"Build complete in {run_time} seconds!",
            f"{added_items_count} items added — {items_skipped_count} items skipped"
        )
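
Putting the pieces together, a hypothetical end-to-end run might look like the sketch below; the class name Cache and the feed URL are illustrative placeholders that do not appear in these excerpts:

cache = Cache()
cache.build_from_single_rss("https://hnrss.org/frontpage")
cache.clean()                      # drop items outside the time window
cache.save_to_file("cache.json")   # snapshot the cache for the next run
cache.get_all_neighbors()          # cluster, deduplicate, and summarize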