def run(self):
    """
    Add synonyms of new query keywords to the query_word table.

    Unless "--no-lock" appears in sys.argv, an exclusive lock on
    $HOME/.event-detection-active is held while the queries are processed.
    Each unprocessed query gets its synonyms stored through the DataSource
    and is then passed to post_query_processor_update.

    :return: Nothing
    """
    # Stays None until the lock file is really open, so the cleanup below
    # never tries to unlock or close something that was never acquired.
    lock_file = None
    # No threshold is handed to Query; kept as a named value for readability.
    threshold = None
    try:
        if "--no-lock" not in sys.argv:
            lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
            fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)

        ds = DataSource()
        for query in ds.get_unprocessed_queries():
            # query[0] is the query id; query[1:6] are subject, verb,
            # direct object, indirect object and location.
            query_id = query[0]
            print(query)
            query_parts = {
                "query": " ".join(filter(None, query[1:6])),
                "subject": query[1],
                "verb": query[2],
                "direct_obj": query[3],
                "indirect_obj": query[4],
                "location": query[5],
            }
            print(query_parts)
            synonyms = Query(query_id, query_parts, threshold).get_synonyms()
            print(synonyms)

            # synonyms = {NN: {word1: [synonyms], word2: [synonyms], ...}, VB: ...}
            for pos_group in synonyms:
                print(synonyms[pos_group])
                for query_word in synonyms[pos_group]:
                    ds.insert_query_word_synonym(
                        query_id, query_word, pos_group,
                        synonyms[pos_group][query_word])

            ds.post_query_processor_update(query_id)
    finally:
        if lock_file is not None:
            try:
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
            finally:
                # Close even if unlocking fails so the descriptor is not leaked.
                lock_file.close()
class Cluster:
    """
    A group of articles produced by the clustering algorithm, together with
    a lazily built set of the keywords of its articles.
    """

    # DataSource shared by all Cluster instances; used to read article keywords.
    ds = DataSource()

    def __init__(self, id):
        """
        Build an empty cluster.

        :param id: cluster id from algorithm
        :return: None
        """
        self.id = id
        self.article_ids = []
        self.article_titles = []
        # None means "not computed yet"; see get_keywords.
        self.keywords = None

    def add_article(self, article_id, article_title):
        """
        Record one more article as a member of this cluster.

        :param article_id: article id
        :param article_title: article title
        :return: None
        """
        self.article_ids.append(article_id)
        self.article_titles.append(article_title)

    def is_valid_cluster(self, num_articles):
        """
        Tell whether the cluster has a useful size: more than one article,
        but fewer than a quarter of all the articles considered.

        :param num_articles: number of articles considered
        :return: true if valid cluster; else false
        """
        upper_bound = num_articles / 4
        size = len(self.article_ids)
        return upper_bound > size and size > 1

    def get_keywords(self):
        """
        Collect the keywords of every article in the cluster.

        The result is computed on the first call and reused afterwards.

        :return: set of keywords
        """
        if self.keywords is not None:
            # already built on an earlier call
            return self.keywords

        self.keywords = set()
        # Load and parse the stored keyword JSON of every article first.
        parsed = []
        for article in self.article_ids:
            row = self.ds.get_article_keywords(article)
            parsed.append(json.loads(row[0]))

        # Each parsed entry is indexed by part of speech; the first element
        # of every keyword entry is the keyword itself.
        for by_pos in parsed:
            for pos in by_pos:
                self.keywords.update(entry[0] for entry in by_pos[pos])
        return self.keywords

    def get_article_ids(self):
        """
        Give the ids of the articles that belong to this cluster.

        :return: list of articles ids
        """
        return self.article_ids
def __init__(self):
    """
    Set up the data source and the phone and email notification clients.

    :return: None
    """
    # database access
    self.datasource = DataSource()
    # Twilio client, built from the account SID and auth token
    self.phone_client = TwilioRestClient(
        twilio_account_sid,
        twilio_auth_token,
    )
    # SendGrid client, built from the API key
    self.email_client = sendgrid.SendGridClient(
        sendgrid_api_key,
    )
def __init__(self):
    """
    Set up the data source, empty bookkeeping state and the NLTK helpers.

    :return: None
    """
    self.ds = DataSource()

    # per-instance collections and counters, all starting out empty / zero
    self.ids = []
    self.num_entries = self.num_articles = self.num_article_words = 0
    self.article_titles = []

    # English stopwords kept as a set, plus a WordNet lemmatizer
    english_stopwords = nltk.corpus.stopwords.words('english')
    self.stopwords = set(english_stopwords)
    self.lemmatizer = WordNetLemmatizer()
def run(self):
    """
    Add keywords as a JSON string to the unprocessed articles in the database.

    Unless "--no-lock" appears in sys.argv, an exclusive lock on
    $HOME/.event-detection-active is held while the articles are processed.
    Each article body is read from "<cwd>/articles/<filename>"; an article
    whose file cannot be read is reported on stderr and skipped.

    :return: Nothing
    """
    # Stays None until the lock file is really open, so the cleanup below
    # never tries to unlock or close something that was never acquired.
    lock_file = None
    try:
        if "--no-lock" not in sys.argv:
            lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
            fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)

        ds = DataSource()
        for article in ds.get_unprocessed_articles():
            try:
                extractor = KeywordExtractor()
                # row layout: id, title, filename, url, source
                article_id = article[0]
                article_title = article[1]
                article_filename = article[2]
                article_url = article[3]
                article_source = article[4]

                article_path = os.getcwd() + "/articles/{0}".format(article_filename)
                # "with" closes the file even when read() fails, e.g. on a
                # decoding error that the except clause below does not catch.
                with open(article_path, "r", encoding="utf8") as article_file:
                    body = article_file.read()

                article_with_body = Article(article_title, body, article_url, article_source)
                keywords = extractor.extract_keywords(article_with_body)
                keyword_string = json.dumps(keywords)
                ds.add_keywords_to_article(article_id, keyword_string)
                ds.add_article_to_query_articles(article_id)
            except (FileNotFoundError, IOError):
                print("Wrong file or file path", file=sys.stderr)
    finally:
        if lock_file is not None:
            try:
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
            finally:
                # Close even if unlocking fails so the descriptor is not leaked.
                lock_file.close()
def validate(self, query_id, article_id):
    """
    validate -- evaluates how much an article validates a query

    Every query word can contribute 2 points: 2 when the word itself is
    among the article keywords, 1 when only one of its synonyms is.

    :param query_id: id of the query to validate
    :param article_id: id of the article to validate with
    :return: match_percentage (points scored divided by the points
             available; 0 when the query has no words)
    """
    ds = DataSource()

    # Rows look like ('test', 'NN', 'Random', ['trial', 'run', ...]):
    # index 0 is the query word, index 3 its list of synonyms.
    synonyms_by_word = {}
    for row in ds.get_query_synonyms(query_id):
        normalized_synonyms = [self.normalize_keyword(s) for s in row[3]]
        synonyms_by_word[self.normalize_keyword(row[0])] = normalized_synonyms

    # Stored article keywords: {NN: [list of keywords], VB: [list of verb keywords]}
    keywords_by_pos = json.loads(ds.get_article_keywords(article_id)[0])
    article_terms = {
        self.normalize_keyword(entry[0])
        for pos in keywords_by_pos
        for entry in keywords_by_pos[pos]
    }

    # Score the query words against the flattened article keywords.
    score = 0
    for word, alternatives in synonyms_by_word.items():
        if word in article_terms:
            score += 2
        elif any(alt in article_terms for alt in alternatives):
            score += 1

    best_possible = 2 * len(synonyms_by_word)
    if best_possible == 0:
        return 0
    return score / best_possible
import sys import os sys.path.insert(0, os.path.abspath('..')) sys.path.insert(0, os.path.abspath('.')) from flask import Flask, render_template, request, redirect import subprocess from Utils import subprocess_helpers from Utils.DataSource import * app = Flask(__name__) dataSource = DataSource() def launch_preprocessors(): process = subprocess.Popen(subprocess_helpers.python_path + " Daemons/QueryProcessorDaemon.py && " + subprocess_helpers.python_path + " Daemons/ArticleProcessorDaemon.py", executable=subprocess_helpers.executable, shell=True, universal_newlines=True) @app.route("/", methods=["GET"]) def queries(): # Get lists of query from database with counts of associated articles all_queries = dataSource.queries_route() queries_formatted = [{ "id": q[0], "subject": q[1],