Beispiel #1
0
    def run(self):
        """
        Add synonyms of new keywords into the query_word table.

        Unless "--no-lock" is present in sys.argv, an exclusive lock on
        $HOME/.event-detection-active is held for the duration of the run.
        For every unprocessed query the synonyms are computed, stored one
        query word at a time, and the query is then marked via
        post_query_processor_update.

        :return: Nothing
        """
        # Passed straight through to Query; None is the value used originally.
        THRESHOLD = None

        use_lock = "--no-lock" not in sys.argv
        # Stays None until the lock file is really open, so the cleanup below
        # never touches a handle that was not acquired.
        lock_file = None
        try:
            if use_lock:
                lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)

            ds = DataSource()
            for query in ds.get_unprocessed_queries():
                query_id = query[0]
                print(query)
                # Columns 1..5 hold subject, verb, direct object, indirect
                # object and location; empty parts are dropped from the
                # joined query text.
                query_parts = {"query": " ".join(filter(None, query[1:6])), "subject": query[1], "verb": query[2],
                               "direct_obj": query[3], "indirect_obj": query[4], "location": query[5]}
                print(query_parts)
                synonyms = Query(query_id, query_parts, THRESHOLD).get_synonyms()
                print(synonyms)
                # synonyms = {NN: {word1: [list of synonym], word2: [list of synonym],...}, VB..}

                for pos_group, words in synonyms.items():
                    print(words)
                    for query_word, word_synonyms in words.items():
                        ds.insert_query_word_synonym(query_id, query_word, pos_group, word_synonyms)

                ds.post_query_processor_update(query_id)
        finally:
            if lock_file is not None:
                try:
                    fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
                finally:
                    # Always close, even if the explicit unlock fails.
                    lock_file.close()
class Cluster:
    """
    A group of articles produced by the clustering algorithm, together with
    a lazily computed set of the keywords of all its articles.
    """

    # Data source shared by all Cluster instances; used to look up keywords.
    ds = DataSource()

    def __init__(self, id):
        """
        Creates a cluster object
        :param id: cluster id from algorithm
        :return: None
        """
        self.id = id
        self.article_ids = []
        self.article_titles = []
        # None means "not computed yet"; filled in by get_keywords().
        self.keywords = None

    def add_article(self, article_id, article_title):
        """
        Adds an article to the cluster
        :param article_id: article id
        :param article_title: article title
        :return: None
        """
        self.article_ids.append(article_id)
        self.article_titles.append(article_title)

    def is_valid_cluster(self, num_articles):
        """
        Checks if cluster is valid: meaning it contains more than one article,
        but fewer articles than a quarter of all the articles considered
        :param num_articles: number of articles considered
        :return: true if valid cluster; else false
        """
        return num_articles / 4 > len(self.article_ids) > 1

    def get_keywords(self):
        """
        Gets the cumulative set of keywords for the cluster.

        The set is built on the first call and cached. It is only cached
        after every article has been processed successfully, so a failed
        lookup or JSON parse does not leave an empty or partial set behind
        for later calls.
        :return: set of keywords
        """
        # don't rebuild the keyword set if it has already been built
        if self.keywords is not None:
            return self.keywords

        collected = set()
        for article in self.article_ids:
            # The first element of the returned row is a JSON string mapping
            # each part-of-speech tag to a list of keyword entries.
            kw_dict = json.loads(self.ds.get_article_keywords(article)[0])
            for entries in kw_dict.values():
                for kw in entries:
                    # kw[0] is the keyword itself.
                    collected.add(kw[0])

        self.keywords = collected
        return self.keywords

    def get_article_ids(self):
        """
        Returns the article ids associated with this cluster
        :return: list of articles ids
        """
        return self.article_ids
Beispiel #3
0
 def __init__(self):
     """
     Create the data source plus the Twilio and SendGrid clients that this
     object keeps as attributes.
     :return: None
     """
     self.datasource = DataSource()

     # Twilio REST client, built from the account SID and auth token.
     twilio_credentials = (twilio_account_sid, twilio_auth_token)
     self.phone_client = TwilioRestClient(*twilio_credentials)

     # SendGrid client, built from the API key.
     self.email_client = sendgrid.SendGridClient(sendgrid_api_key)
 def __init__(self):
     """
     Set up the initial state: data source, empty id/title lists, zeroed
     counters, the English stopword set and a WordNet lemmatizer.
     :return: None
     """
     self.ds = DataSource()

     # Collections start out empty.
     self.ids = []

     # All counters start at zero.
     self.num_entries = self.num_articles = self.num_article_words = 0

     self.article_titles = []

     # A set gives constant-time stopword membership tests.
     english_stopwords = nltk.corpus.stopwords.words('english')
     self.stopwords = set(english_stopwords)
     self.lemmatizer = WordNetLemmatizer()
Beispiel #5
0
    def run(self):
        """
        Adds keywords as a JSON string to articles in database.

        Unless "--no-lock" is present in sys.argv, an exclusive lock on
        $HOME/.event-detection-active is held for the duration of the run.
        Each unprocessed article's body is read from
        <cwd>/articles/<filename>, keywords are extracted and stored as JSON,
        and the article is then passed to add_article_to_query_articles.
        I/O errors for a single article are reported on stderr and that
        article is skipped.

        :return: Nothing
        """
        use_lock = "--no-lock" not in sys.argv
        # Stays None until the lock file is really open, so the cleanup below
        # never touches a handle that was not acquired.
        lock_file = None
        try:
            if use_lock:
                lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)

            ds = DataSource()
            for article in ds.get_unprocessed_articles():
                try:
                    extractor = KeywordExtractor()
                    article_id = article[0]
                    article_title = article[1]
                    article_filename = article[2]
                    article_url = article[3]
                    article_source = article[4]

                    # The context manager closes the file even if read() fails.
                    with open(
                            os.getcwd() + "/articles/{0}".format(article_filename),
                            "r",
                            encoding="utf8") as article_file:
                        body = article_file.read()

                    article_with_body = Article(article_title, body,
                                                article_url, article_source)
                    keywords = extractor.extract_keywords(article_with_body)
                    keyword_string = json.dumps(keywords)
                    ds.add_keywords_to_article(article_id, keyword_string)
                    ds.add_article_to_query_articles(article_id)
                except (FileNotFoundError, IOError):
                    # Best effort: report and continue with the next article.
                    print("Wrong file or file path", file=sys.stderr)
        finally:
            if lock_file is not None:
                try:
                    fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
                finally:
                    # Always close, even if the explicit unlock fails.
                    lock_file.close()
Beispiel #6
0
    def validate(self, query_id, article_id):
        """
        validate -- evaluates how much an article validates a query

        Every query word is worth 2 points: both points if the word itself
        is among the article's keywords, 1 point if only one of its synonyms
        is, 0 otherwise. The result is the points earned divided by the
        points available.

        :param query_id: id of the query to validate
        :param article_id: id of the article to validate with
        :return: match_percentage (relative measure of how well article
                 validates query); 0 if the query has no words
        """
        ds = DataSource()
        normalize = self.normalize_keyword

        # Rows look like ('test', 'NN', 'Random', ['trial', 'run', 'exam', ...]):
        # index 0 is the query word, index 3 its list of synonyms.
        synonym_rows = ds.get_query_synonyms(query_id)
        query_synonyms = {}
        for row in synonym_rows:
            normalized_synonyms = [normalize(candidate) for candidate in row[3]]
            query_synonyms[normalize(row[0])] = normalized_synonyms

        # {NN: [list of keywords], VB: [list of verb keywords]}
        keywords_by_pos = json.loads(ds.get_article_keywords(article_id)[0])
        article_keywords_flat = {
            normalize(entry[0])
            for entries in keywords_by_pos.values()
            for entry in entries
        }

        # find matches
        match_value = 0
        for query_word, candidates in query_synonyms.items():
            if query_word in article_keywords_flat:
                match_value += 2
            elif any(candidate in article_keywords_flat for candidate in candidates):
                match_value += 1

        max_match_value = 2 * len(query_synonyms)
        if max_match_value == 0:
            return 0
        return match_value / max_match_value
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('.'))

from flask import Flask, render_template, request, redirect
import subprocess
from Utils import subprocess_helpers
from Utils.DataSource import *

# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Module-level DataSource instance, created at import time and used by the
# route handlers.
dataSource = DataSource()


def launch_preprocessors():
    """
    Start the query processor daemon followed by the article processor
    daemon in a single shell command.

    The two scripts are chained with "&&", so the article processor only
    runs if the query processor exits successfully. The process is started
    with Popen and is not waited on.
    """
    interpreter = subprocess_helpers.python_path
    query_step = interpreter + " Daemons/QueryProcessorDaemon.py && "
    article_step = interpreter + " Daemons/ArticleProcessorDaemon.py"

    subprocess.Popen(query_step + article_step,
                     executable=subprocess_helpers.executable,
                     shell=True,
                     universal_newlines=True)


@app.route("/", methods=["GET"])
def queries():
    # Get lists of query from database with counts of associated articles
    all_queries = dataSource.queries_route()
    queries_formatted = [{
        "id": q[0],
        "subject": q[1],