def validate(self, query_id, article_id):
    """
    validate -- scores how well an article's keywords cover a query's words
    :param query_id: id handed to DataSource.get_query_synonyms
    :param article_id: id handed to DataSource.get_article_keywords
    :return: match_percentage -- points earned divided by points possible
             (2 points per query word); 0 when the query has no words
    """
    source = DataSource()

    # Only fields 0 (the word) and 3 (its synonym list) of each row are read,
    # e.g. ('test', 'NN', 'Random', ['trial', 'run', 'exam', ...]).
    synonyms_by_word = {}
    for row in source.get_query_synonyms(query_id):
        synonyms_by_word[self.normalize_keyword(row[0])] = [
            self.normalize_keyword(alternative) for alternative in row[3]
        ]

    # The first element of the lookup result is JSON shaped like
    # {NN: [list of keywords], VB: [list of verb keywords]}.
    keywords_by_pos = json.loads(source.get_article_keywords(article_id)[0])
    article_terms = {
        self.normalize_keyword(entry[0])
        for pos in keywords_by_pos
        for entry in keywords_by_pos[pos]
    }

    # An exact word hit earns 2 points; otherwise the first synonym hit earns 1.
    earned = 0
    for word, alternatives in synonyms_by_word.items():
        if word in article_terms:
            earned += 2
        elif any(alternative in article_terms for alternative in alternatives):
            earned += 1

    possible = 2 * len(synonyms_by_word)
    if possible == 0:
        return 0
    return earned / possible
Example #2
0
def convert_symbols_to_raw(src, dst, model, level, tolevel=0):
    """
    convert a symbolic time series to lower level or raw data

    level may be given by name: "raw" (nothing to do), "symbol" (0),
    "rle" (1) or "statecluster" (2); any other value is passed through int().
    For "statecluster" the source file is first run-length encoded in place.
    When int(tolevel) is 0 the data is converted with cv.ToRaw, otherwise with
    cv.ToSymbols; for a target level of 2 the destination file is then
    run-length encoded in place as well.
    """
    if level == "raw":
        return

    def rle_in_place(path):
        # Load the file, run-length encode its data and save it to the same path.
        encoder = pr.RLEProcess()
        source = ds.FileDataSource(path, path)
        source.load()
        source.data = encoder.batch_process(source.data)
        source.save()

    if level == "symbol":
        level = 0
    elif level == "rle":
        level = 1
    elif level == "statecluster":
        level = 2
        rle_in_place(src)

    target = int(tolevel)
    if target == 0:
        cv.ToRaw().convert(src, dst, model, int(level))
        return

    cv.ToSymbols().convert(src, dst, model, int(level))
    if target == 2:
        rle_in_place(dst)
Example #3
0
 def __init__(self):
     """
     Creates the data source plus the SMS and e-mail clients.
     Credentials are read from the module-level configuration names.
     :return: None
     """
     self.datasource = DataSource()

     sms_client = TwilioRestClient(twilio_account_sid, twilio_auth_token)
     self.phone_client = sms_client

     mail_client = sendgrid.SendGridClient(sendgrid_api_key)
     self.email_client = mail_client
Example #4
0
def get_distance(file0, file1, rate):
    """
    Print the Levenshtein distance between the data of two files.

    Both files are loaded through ds.FileDataSource and compared with
    fu.Get_Distance().levenshtein at the given rate (converted with int()).
    The result is indexed as (total distance, total time length); both values
    and their ratio are printed.

    Raises ZeroDivisionError if the reported total time length is 0.
    """
    dat0 = ds.FileDataSource(file0, None)
    dat1 = ds.FileDataSource(file1, None)
    dat0.load()
    dat1.load()
    gdt = fu.Get_Distance()
    dist = gdt.levenshtein(dat0.data, dat1.data, int(rate))
    # A parenthesised single argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, and prints the same text.
    print("total-distance (d) = %d" % (dist[0]))
    print("total-time-length (l) = %d" % (dist[1]))
    print("normalized-distance (d/l) = %f" % (dist[0]*1.0/dist[1]))
 def __init__(self):
     """
     Sets up the data source, empty bookkeeping fields and the NLP helpers
     :return: None
     """
     self.ds = DataSource()

     # Article bookkeeping starts out empty.
     self.ids = []
     self.article_titles = []

     # Counters all start at zero.
     self.num_entries = self.num_articles = self.num_article_words = 0

     # English stop words from nltk, kept as a set.
     english_stopwords = nltk.corpus.stopwords.words('english')
     self.stopwords = set(english_stopwords)
     self.lemmatizer = WordNetLemmatizer()
Example #6
0
def find_states_spclust(inputfile, outputname, rate, dimensions, wgrid, wnbr, ememory, ethreshold, mindist):
    """
    batch process using spclust symbolization
    - input csv file with triple (time start, time end, values)
    - base name for output files (can include a folder)
    - sampling rate of the input time series
    - number of dimensions of the input time series
    - Spclust grid size
    - Spclust count threshold
    - StateFinder fading factor
    - StateFinder prediction error threshold
    - StateFinder min distance for clustering segments (0-1)

    Files written next to the base name: -model.spc and -symbol.csv (by the
    Spclust jars), -rle.csv, -statefinder.csv and the pickled lookup tables
    in -model.mdl. The cluster count is read from -model.spcn.
    """
    model_path = outputname+"-model.spc"
    symbol_path = outputname+"-symbol.csv"

    # The exit codes of the two Java steps are not checked.
    call(["java", "-jar", "./Spclust/SpComputeModel.jar", inputfile,
          dimensions, wgrid, wnbr, model_path])
    call(["java", "-jar", "./Spclust/SpComputeSymbols.jar",
          model_path, inputfile, symbol_path])

    # Close the count file deterministically instead of leaking the handle.
    with open(outputname+"-model.spcn", 'r') as count_file:
        nbclusters = int(count_file.readline())

    src = ds.FileDataSource(symbol_path, outputname+"-statefinder.csv")
    rel = pr.RLEProcess()
    sem = pr.SegmentSparseProcess(rate, ethreshold, ememory)
    clu = pr.ClusterSparseProcess(mindist, nbclusters)
    src.load()
    src.data = rel.batch_process(src.data)
    src.save_to(outputname+"-rle.csv")
    # NOTE(review): the RLE pass is applied a second time here, as in the
    # original code; confirm whether this repetition is intentional.
    src.data = rel.batch_process(src.data)
    segments = sem.batch_process(src.data)
    (src.data, lookup) = clu.batch_process(segments, src.data)
    src.save()

    # One lookup table per level: 0 = symbols, 1 = RLE, 2 = state clusters.
    lookups = {0:lt.SpclustSymbolLookupTable(model_path),
               1:lt.ExpandLookupTable(rate),
               2:(lt.ClusterSparseLookupTable(lookup, rate))}
    # Pickle data is bytes: binary mode is required on Python 3 and avoids
    # newline translation elsewhere. The with-block closes the file on error.
    with open(outputname+"-model.mdl", 'wb') as lkf:
        pickle.dump(lookups, lkf)
 def __init__(self):
     """
     Creates the data source plus the SMS and e-mail clients.
     Credentials are read from the module-level configuration names.
     :return: None
     """
     self.datasource = DataSource()

     sms_client = TwilioRestClient(twilio_account_sid, twilio_auth_token)
     self.phone_client = sms_client

     mail_client = sendgrid.SendGridClient(sendgrid_api_key)
     self.email_client = mail_client
Example #8
0
    def run(self):
        """
        add synonyms of new keywords into query_word table

        Unless "--no-lock" is on the command line, an exclusive lock on
        $HOME/.event-detection-active is held while the queries are processed.
        :return: Nothing
        """
        # Stays None until the lock file is actually open, so the cleanup
        # below never touches a descriptor this method did not create.
        lock_file = None
        try:
            if "--no-lock" not in sys.argv:
                lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)
            ds = DataSource()
            THRESHOLD = None
            # fetch the queries that have not been processed yet
            for query in ds.get_unprocessed_queries():
                print(query)
                query_parts = {"query": " ".join(filter(None, query[1:6])), "subject": query[1], "verb": query[2],
                               "direct_obj": query[3], "indirect_obj": query[4], "location": query[5]}
                print(query_parts)
                synonyms = Query(query[0], query_parts, THRESHOLD).get_synonyms()
                print(synonyms)
                # synonyms = {NN: {word1: [list of synonym], word2: [list of synonym],...}, VB..}

                for pos_group in synonyms:
                    print(synonyms[pos_group])
                    for query_word in synonyms[pos_group]:
                        ds.insert_query_word_synonym(query[0], query_word, pos_group, synonyms[pos_group][query_word])

                ds.post_query_processor_update(query[0])
        finally:
            if lock_file is not None:
                try:
                    fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
                finally:
                    lock_file.close()
class Cluster:
    # One DataSource shared by every cluster; created when the class body runs.
    ds = DataSource()

    def __init__(self, id):
        """
        Creates a cluster object
        :param id: cluster id from algorithm
        :return: None
        """
        self.id = id
        self.article_ids = []
        self.article_titles = []
        # Filled lazily by get_keywords(); None means "not built yet".
        self.keywords = None

    def add_article(self, article_id, article_title):
        """
        Adds an article to the cluster
        :param article_id: article id
        :param article_title: article title
        :return: None
        """
        self.article_ids.append(article_id)
        self.article_titles.append(article_title)

    def is_valid_cluster(self, num_articles):
        """
        Checks if cluster is valid: meaning it contains more than one article,
        but fewer article than a quarter of all the articles considered
        :param num_articles: number of articles considered
        :return: true if valid cluster; else false
        """
        return num_articles / 4 > len(self.article_ids) > 1

    def get_keywords(self):
        """
        gets the cumulative list of keywords for the cluster

        The set is built on the first call and cached; articles added after
        that call are not reflected in the cached set.
        :return: set of keywords
        """
        if self.keywords is None:
            # Collect into a local set and publish it only once every article
            # has loaded, so a failed lookup cannot leave a partial or empty
            # set cached as if it were the final answer.
            collected = set()
            for article in self.article_ids:
                kw_dict = json.loads(self.ds.get_article_keywords(article)[0])
                for pos in kw_dict:
                    collected.update(kw[0] for kw in kw_dict[pos])
            self.keywords = collected
        return self.keywords

    def get_article_ids(self):
        """
        Returns the article ids associated with this cluster
        :return: list of articles ids
        """
        return self.article_ids
Example #10
0
def convert_rle(src, dst):
    """
    apply RLE compression

    Loads the data through ds.FileDataSource(src, dst), runs it through
    pr.RLEProcess and saves the result.
    """
    source = ds.FileDataSource(src, dst)
    encoder = pr.RLEProcess()
    source.load()
    encoded = encoder.batch_process(source.data)
    source.data = encoded
    source.save()
Example #11
0
def convert_median_filter(src, dst, win):
    """
    Apply median filtering to the time series

    Loads the data through ds.FileDataSource(src, dst), runs it through
    pr.MedianFilteringProcess(win) and saves the result.
    """
    source = ds.FileDataSource(src, dst)
    median_filter = pr.MedianFilteringProcess(win)
    source.load()
    filtered = median_filter.batch_process(source.data)
    source.data = filtered
    source.save()
Example #12
0
def split_file_by(filename, folder, offset=0, duration=86400):
    """
    split the file for applying the forecasting algorithm

    The data is first passed through fu.PeriodicCutProcess and then written
    out by fu.Splitter.splitFiles into folder. duration and offset are
    converted with int() before use.
    """
    source = ds.FileDataSource(filename, None)
    period = int(duration)
    shift = int(offset)
    cutter = fu.PeriodicCutProcess(period, shift)
    source.load()
    source.data = cutter.batch_process(source.data)
    fu.Splitter(source.data).splitFiles(folder, period, shift)
Example #13
0
    def validate(self, query_id, article_id):
        """
        validate -- scores how well an article's keywords cover a query's words
        :param query_id: id handed to DataSource.get_query_synonyms
        :param article_id: id handed to DataSource.get_article_keywords
        :return: match_percentage -- points earned divided by points possible
                 (2 points per query word); 0 when the query has no words
        """
        source = DataSource()

        # Only fields 0 (the word) and 3 (its synonym list) of each row are
        # read, e.g. ('test', 'NN', 'Random', ['trial', 'run', 'exam', ...]).
        synonyms_by_word = {}
        for row in source.get_query_synonyms(query_id):
            synonyms_by_word[self.normalize_keyword(row[0])] = [
                self.normalize_keyword(alternative) for alternative in row[3]
            ]

        # The first element of the lookup result is JSON shaped like
        # {NN: [list of keywords], VB: [list of verb keywords]}.
        keywords_by_pos = json.loads(
            source.get_article_keywords(article_id)[0])
        article_terms = {
            self.normalize_keyword(entry[0])
            for pos in keywords_by_pos
            for entry in keywords_by_pos[pos]
        }

        # An exact word hit earns 2 points; otherwise the first synonym hit
        # earns 1.
        earned = 0
        for word, alternatives in synonyms_by_word.items():
            if word in article_terms:
                earned += 2
            elif any(alternative in article_terms
                     for alternative in alternatives):
                earned += 1

        possible = 2 * len(synonyms_by_word)
        if possible == 0:
            return 0
        return earned / possible
Example #14
0
def find_states(inputfile, outputname, rate, smethod, snbr, ememory, ethreshold, mindist):
    """
    batch process using standard symbolization
    - input csv file with triple (time start, time end, value)
    - base name for output files (can include a folder)
    - sampling rate of the input time series
    - symbolization method (0:uniform, 1:median, 2:distinct median)
    - number of symbols to generate
    - StateFinder fading factor
    - StateFinder prediction error threshold
    - StateFinder min distance for clustering segments (0-1)

    Files written next to the base name: -symbol.csv, -rle.csv,
    -statefinder.csv and the pickled lookup tables in -model.mdl.
    """
    src = ds.FileDataSource(inputfile, outputname+"-statefinder.csv")
    src.load()
    # smethod is compared as a string; anything other than "1" or "2" keeps
    # the uniform symbolizer.
    enc = sbz.UniformSymbolizer()
    if smethod == "1":
        enc = sbz.MedianSymbolizer()
    if smethod == "2":
        enc = sbz.DistinctMedianSymbolizer()
    enc.load(src.data)
    (sep, mini, maxi) = enc.get_separators(int(snbr))
    sym = pr.SymbolizeProcess(1, sep)
    rel = pr.RLEProcess()
    sem = pr.SegmentSparseProcess(rate, ethreshold, ememory)
    clu = pr.ClusterSparseProcess(mindist, int(snbr)+1)

    # Each intermediate stage is saved before the next one runs.
    src.data = sym.batch_process(src.data)
    src.save_to(outputname+"-symbol.csv")
    src.data = rel.batch_process(src.data)
    src.save_to(outputname+"-rle.csv")
    segments = sem.batch_process(src.data)
    (src.data, lookup) = clu.batch_process(segments, src.data)
    src.save()

    # One lookup table per level: 0 = symbols, 1 = RLE, 2 = state clusters.
    lookups = {0:lt.SymbolLookupTable(sep, mini, maxi),
               1:lt.ExpandLookupTable(rate),
               2:(lt.ClusterSparseLookupTable(lookup, rate))}
    # Pickle data is bytes: binary mode is required on Python 3 and avoids
    # newline translation elsewhere. The with-block closes the file on error.
    with open(outputname+"-model.mdl", 'wb') as lkf:
        pickle.dump(lookups, lkf)
Example #15
0
    def run(self):
        """
        adds keywords as a JSON string to articles in database

        Unless "--no-lock" is on the command line, an exclusive lock on
        $HOME/.event-detection-active is held while the articles are
        processed. Articles whose file cannot be read are reported on stderr
        and skipped.
        :return: Nothing
        """
        # Stays None until the lock file is actually open, so the cleanup
        # below never touches a descriptor this method did not create.
        lock_file = None
        try:
            if "--no-lock" not in sys.argv:
                lock_file = open(
                    os.getenv("HOME") + "/.event-detection-active", "wb")
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)
            ds = DataSource()
            for article in ds.get_unprocessed_articles():
                try:
                    extractor = KeywordExtractor()
                    article_id = article[0]
                    article_title = article[1]
                    article_filename = article[2]
                    article_url = article[3]
                    article_source = article[4]
                    article_path = os.getcwd() + "/articles/{0}".format(
                        article_filename)
                    # The with-block closes the file even if read() fails.
                    with open(article_path, "r",
                              encoding="utf8") as article_file:
                        body = article_file.read()

                    article_with_body = Article(article_title, body,
                                                article_url, article_source)
                    keywords = extractor.extract_keywords(article_with_body)
                    keyword_string = json.dumps(keywords)
                    ds.add_keywords_to_article(article_id, keyword_string)
                    ds.add_article_to_query_articles(article_id)
                except (FileNotFoundError, IOError):
                    print("Wrong file or file path", file=sys.stderr)
        finally:
            if lock_file is not None:
                try:
                    fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
                finally:
                    lock_file.close()
    def run(self):
        """
        adds keywords as a JSON string to articles in database

        Unless "--no-lock" is on the command line, an exclusive lock on
        $HOME/.event-detection-active is held while the articles are
        processed. Articles whose file cannot be read are reported on stderr
        and skipped.
        :return: Nothing
        """
        # Stays None until the lock file is actually open, so the cleanup
        # below never touches a descriptor this method did not create.
        lock_file = None
        try:
            if "--no-lock" not in sys.argv:
                lock_file = open(os.getenv("HOME") + "/.event-detection-active", "wb")
                fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)
            ds = DataSource()
            for article in ds.get_unprocessed_articles():
                try:
                    extractor = KeywordExtractor()
                    article_id = article[0]
                    article_title = article[1]
                    article_filename = article[2]
                    article_url = article[3]
                    article_source = article[4]
                    article_path = os.getcwd()+"/articles/{0}".format(article_filename)
                    # The with-block closes the file even if read() fails.
                    with open(article_path, "r", encoding="utf8") as article_file:
                        body = article_file.read()

                    article_with_body = Article(article_title, body, article_url, article_source)
                    keywords = extractor.extract_keywords(article_with_body)
                    keyword_string = json.dumps(keywords)
                    ds.add_keywords_to_article(article_id, keyword_string)
                    ds.add_article_to_query_articles(article_id)
                except (FileNotFoundError, IOError):
                    print("Wrong file or file path", file=sys.stderr)
        finally:
            if lock_file is not None:
                try:
                    fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)
                finally:
                    lock_file.close()
Example #17
0
class Notifier:
    """
    Used to notify user of query detection
    """

    bitly_api_url = "https://api-ssl.bitly.com"

    def __init__(self):
        """
        Initializes notification clients.
        :return: None
        """
        self.datasource = DataSource()
        self.phone_client = TwilioRestClient(twilio_account_sid,
                                             twilio_auth_token)
        self.email_client = sendgrid.SendGridClient(sendgrid_api_key)
        # Recipient details are filled in by on_validation(); starting them at
        # None makes alert_phone/alert_email a no-op before that instead of
        # raising AttributeError.
        self.phone = None
        self.email = None

    def check_valid_phone(self, phone):
        """
        Checks that a phone number starts with "+1" followed by nine digits.
        Only the start of the string is checked, so trailing characters are
        accepted.
        :param phone: phone number string, or None
        :return: True if the pattern matches; otherwise False
        """
        if phone is None:
            return False
        return re.match(r'\+1[0-9]{9}', phone) is not None

    def check_valid_email(self, email):
        """
        Checks that an address starts with text of the form x@y.z where none
        of the three parts contains a dot. Only the start of the string is
        checked.
        :param email: email address string, or None
        :return: True if the pattern matches; otherwise False
        """
        if email is None:
            return False
        return re.match(r'[^\.]+@[^\.]+\.[^\.]+', email) is not None

    def alert_phone(self, text):
        """
        Sends text message to self.phone if it passes check_valid_phone
        :param text: the text body
        :return: None
        """
        if self.check_valid_phone(self.phone):
            try:
                self.phone_client.messages.create(body=text,
                                                  to=self.phone,
                                                  from_=twilio_number)
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            except Exception:
                print(
                    "Twilio Error. If using a trial account, make sure phone number is verified with twilio at twilio.com/user/account/phone-numbers/verified"
                )

    def alert_email(self, text):
        """
        Sends email message to self.email if it passes check_valid_email;
        a status code other than 200 is printed
        :param text: the email body in html
        :return: None
        """
        if self.check_valid_email(self.email):
            message = sendgrid.Mail()
            message.add_to(self.email)

            message.set_from(from_email)
            message.set_subject("Event Detection")
            message.set_html(text)

            status_code, status_message = self.email_client.send(message)
            if int(status_code) != 200:
                print("Error " + str(status_code) + " : " +
                      str(status_message))

    def on_validation(self, query_id, article_ids):
        """
        Notifies user on validation
        :param query_id: query that was validated
        :param article_ids: articles that validated query
        :return: None
        """
        query_string = " ".join(self.datasource.get_query_elements(query_id))
        article_data = []
        for article_id in article_ids:
            article_url = self.get_article_shortlink(
                self.datasource.get_article_url(article_id))
            article_title = self.datasource.get_article_title(article_id)
            article_data.append((article_title, article_url))

        html = self.format_html(query_string, article_data)
        texts = self.format_plaintext(query_string, article_data)

        self.phone, self.email = self.datasource.get_email_and_phone(query_id)
        self.alert_email(html)
        for text in texts:
            self.alert_phone(text)

    @staticmethod
    def format_html(query_string, article_data):
        """
        formats body of email
        :param query_string: query that was validated
        :param article_data: (title, url) pairs of the validating articles
        :return: html of email body
        """
        # Imported locally so this method does not depend on file-level imports.
        from html import escape

        # Query text, titles and URLs are escaped so that characters such as
        # <, & or " in them cannot break or inject markup.
        parts = ["<h1>{query}</h1><p>Articles:</p>".format(
            query=escape(str(query_string)))]
        for article_title, article_url in article_data:
            parts.append("<p><a href=\"{url}\">{title}</a></p>".format(
                url=escape(str(article_url), quote=True),
                title=escape(str(article_title))))
        return "".join(parts)

    @staticmethod
    def format_plaintext(query_string, article_data):
        """
        formats text message bodies

        Every message starts with the same header. A new message is started
        when adding the next article would push the current one past 1600
        characters.
        :param query_string: query that was validated
        :param article_data: (title, url) pairs of the validating articles
        :return: list of text bodies
        """
        header = "Event Detected!\nQuery: {query}\nArticles: ".format(
            query=query_string)
        texts = []
        text = header
        for article_title, article_url in article_data:
            next_article = "\n{title}\nLink {url}\n".format(
                url=article_url, title=article_title)
            if len(text) + len(next_article) > 1600:
                texts.append(text)
                text = header
            text += next_article
        texts.append(text)
        return texts

    def get_article_shortlink(self, article_url):
        """
        Gets a shortlink from bitly for the article url
        :param article_url: the url to shorten
        :return: the shortened url if successful (otherwise just the article url)
        """
        payload = {
            "longUrl": article_url,
            "login": bitly_api_login,
            "apiKey": bitly_api_key
        }
        try:
            response = requests.get(self.bitly_api_url + "/v3/shorten",
                                    params=payload)
            response_json = response.json()
        except (requests.RequestException, ValueError):
            # A network failure or non-JSON body must not abort the whole
            # notification; fall back to the long url as documented.
            return article_url
        # look for data -> url -> short url in response_json
        # if it's not there, just return the old url
        data = response_json.get("data") if isinstance(response_json,
                                                       dict) else None
        if isinstance(data, dict) and "url" in data:
            return data["url"]
        return article_url
class Notifier:
    """
    Used to notify user of query detection
    """

    bitly_api_url = "https://api-ssl.bitly.com"

    def __init__(self):
        """
        Initializes notification clients.
        :return: None
        """
        self.datasource = DataSource()
        self.phone_client = TwilioRestClient(twilio_account_sid, twilio_auth_token)
        self.email_client = sendgrid.SendGridClient(sendgrid_api_key)
        # Recipient details are filled in by on_validation(); starting them at
        # None makes alert_phone/alert_email a no-op before that instead of
        # raising AttributeError.
        self.phone = None
        self.email = None

    def check_valid_phone(self, phone):
        """
        Checks that a phone number starts with "+1" followed by nine digits.
        Only the start of the string is checked, so trailing characters are
        accepted.
        :param phone: phone number string, or None
        :return: True if the pattern matches; otherwise False
        """
        if phone is None:
            return False
        return re.match(r'\+1[0-9]{9}', phone) is not None

    def check_valid_email(self, email):
        """
        Checks that an address starts with text of the form x@y.z where none
        of the three parts contains a dot. Only the start of the string is
        checked.
        :param email: email address string, or None
        :return: True if the pattern matches; otherwise False
        """
        if email is None:
            return False
        return re.match(r'[^\.]+@[^\.]+\.[^\.]+', email) is not None

    def alert_phone(self, text):
        """
        Sends text message to self.phone if it passes check_valid_phone
        :param text: the text body
        :return: None
        """
        if self.check_valid_phone(self.phone):
            try:
                self.phone_client.messages.create(body=text, to=self.phone, from_=twilio_number)
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate.
            except Exception:
                print("Twilio Error. If using a trial account, make sure phone number is verified with twilio at twilio.com/user/account/phone-numbers/verified")

    def alert_email(self, text):
        """
        Sends email message to self.email if it passes check_valid_email;
        a status code other than 200 is printed
        :param text: the email body in html
        :return: None
        """
        if self.check_valid_email(self.email):
            message = sendgrid.Mail()
            message.add_to(self.email)

            message.set_from(from_email)
            message.set_subject("Event Detection")
            message.set_html(text)

            status_code, status_message = self.email_client.send(message)
            if int(status_code) != 200:
                print("Error " + str(status_code) + " : " + str(status_message))

    def on_validation(self, query_id, article_ids):
        """
        Notifies user on validation
        :param query_id: query that was validated
        :param article_ids: articles that validated query
        :return: None
        """
        query_string = " ".join(self.datasource.get_query_elements(query_id))
        article_data = []
        for article_id in article_ids:
            article_url = self.get_article_shortlink(self.datasource.get_article_url(article_id))
            article_title = self.datasource.get_article_title(article_id)
            article_data.append((article_title, article_url))

        html = self.format_html(query_string, article_data)
        texts = self.format_plaintext(query_string, article_data)

        self.phone, self.email = self.datasource.get_email_and_phone(query_id)
        self.alert_email(html)
        for text in texts:
            self.alert_phone(text)

    @staticmethod
    def format_html(query_string, article_data):
        """
        formats body of email
        :param query_string: query that was validated
        :param article_data: (title, url) pairs of the validating articles
        :return: html of email body
        """
        # Imported locally so this method does not depend on file-level imports.
        from html import escape

        # Query text, titles and URLs are escaped so that characters such as
        # <, & or " in them cannot break or inject markup.
        parts = ["<h1>{query}</h1><p>Articles:</p>".format(query=escape(str(query_string)))]
        for article_title, article_url in article_data:
            parts.append("<p><a href=\"{url}\">{title}</a></p>".format(
                url=escape(str(article_url), quote=True), title=escape(str(article_title))))
        return "".join(parts)

    @staticmethod
    def format_plaintext(query_string, article_data):
        """
        formats text message bodies

        Every message starts with the same header. A new message is started
        when adding the next article would push the current one past 1600
        characters.
        :param query_string: query that was validated
        :param article_data: (title, url) pairs of the validating articles
        :return: list of text bodies
        """
        header = "Event Detected!\nQuery: {query}\nArticles: ".format(query=query_string)
        texts = []
        text = header
        for article_title, article_url in article_data:
            next_article = "\n{title}\nLink {url}\n".format(url=article_url, title=article_title)
            if len(text) + len(next_article) > 1600:
                texts.append(text)
                text = header
            text += next_article
        texts.append(text)
        return texts

    def get_article_shortlink(self, article_url):
        """
        Gets a shortlink from bitly for the article url
        :param article_url: the url to shorten
        :return: the shortened url if successful (otherwise just the article url)
        """
        payload = {"longUrl": article_url, "login": bitly_api_login, "apiKey": bitly_api_key}
        try:
            response = requests.get(self.bitly_api_url + "/v3/shorten", params=payload)
            response_json = response.json()
        except (requests.RequestException, ValueError):
            # A network failure or non-JSON body must not abort the whole
            # notification; fall back to the long url as documented.
            return article_url
        # look for data -> url -> short url in response_json
        # if it's not there, just return the old url
        data = response_json.get("data") if isinstance(response_json, dict) else None
        if isinstance(data, dict) and "url" in data:
            return data["url"]
        return article_url
# coding=utf-8
import tensorflow as tf
import Utils.DataSource as ds

# Load the dataset through the project's DataSource helper, reading from "data/".
mnist = ds.readMnist("data/")

# Interactive session, created before the graph below is built.
sess = tf.InteractiveSession()

# Float32 placeholders; the None dimension leaves the number of rows open.
# x has 784 columns (presumably flattened 28x28 images - confirm against readMnist).
x = tf.placeholder(tf.float32, shape=[None, 784])
# y_ has 10 columns (presumably one-hot digit labels - confirm against readMnist).
y_ = tf.placeholder(tf.float32, shape=[None, 10])

# Weight Initialization
def weight_variable(shape):
  """Return a tf.Variable of the given shape drawn from a truncated normal (stddev 0.1)."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Return a tf.Variable of the given shape with every element set to 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))

# Convolution and Pooling
def conv2d(x, W):
  """Convolve x with filter W using a stride of 1 in every dimension and 'SAME' padding."""
  unit_strides = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
  """Max-pool x with a [1, 2, 2, 1] window and matching strides, using 'SAME' padding."""
  window = [1, 2, 2, 1]
  step = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
# First Convolutional Layer
# Weights of shape [5, 5, 1, 32] and a 32-element bias; presumably a 5x5
# filter with 1 input and 32 output channels (usage is not visible here - confirm).
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
class MatrixCreator:
    def __init__(self):
        """
        Initialize class variables
        :return: None
        """
        self.ds = DataSource()
        self.ids = []
        self.num_entries = 0
        self.num_articles = 0
        self.num_article_words = 0
        self.article_titles = []
        # Defined here so the methods below can be called in any order
        # without hitting a missing attribute.
        self.filenames = []
        self.article_words_by_article = []
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def get_num_entries(self):
        """
        Gets the number of non-zero entries in the matrix.
        :return: number of non-zero entries in the matrix
        """
        return self.num_entries

    def get_num_articles(self):
        """
        Gets the number of articles, which is the number of rows in the matrix
        :return: number of articles
        """
        return self.num_articles

    def get_num_article_words(self):
        """
        Gets the number of unique words across all documents
        :return: number of unique words across all documents
        """
        return self.num_article_words

    def get_article_titles(self):
        """
        Gets ordered list of article titles corresponding to
        article ids in self.ids
        :return: list of article titles
        """
        return self.article_titles

    def get_article_ids(self):
        """
        Gets article ids
        :return: list of article ids
        """
        return self.ids

    def retrieve_article_ids_titles_filenames(self):
        """
        Retrieves article ids, titles, and filenames from database
        and sets class variables. Articles whose file does not exist
        under articles_path are left out.
        :return: None
        """
        articles = self.ds.get_article_ids_titles_filenames()
        self.ids = []
        self.article_titles = []
        self.filenames = []
        for article in articles:
            if os.path.isfile(articles_path + article[2]):
                self.ids.append(article[0])
                self.article_titles.append(article[1])
                self.filenames.append(article[2])
        self.num_articles = len(self.article_titles)

    def get_article_text_by_article(self):
        """
        Builds one word Counter per article (stored in
        self.article_words_by_article, in self.filenames order) and
        collects the set of words across all articles.
        Raises AttributeError if a file does not match the
        TITLE:...TEXT:... layout.
        :return: Set of words used in all articles
        """
        pattern = re.compile(r'TITLE:(.*)TEXT:(.*)', re.DOTALL)

        self.article_words_by_article = []
        all_article_words_set = set()

        for filename in self.filenames:
            # The with-block closes the file even if read() fails.
            with open(articles_path + filename, "r",
                      encoding="utf8") as article_file:
                body = article_file.read()
            tagged_items = pattern.match(body)
            title_tagged = tagged_items.group(1)
            body_tagged = tagged_items.group(2)

            extractor = KeywordExtractor()
            title_text, _ = extractor.preprocess_keywords(title_tagged)
            body_text, _ = extractor.preprocess_keywords(body_tagged)
            body_text.extend(title_text)

            # Count words sentence by sentence and fold them into one
            # Counter for the whole article.
            body_text_counter = Counter()
            for sentence in body_text:
                sentence_counts = Counter(sentence.strip().split())
                body_text_counter.update(sentence_counts)
                all_article_words_set.update(sentence_counts.keys())

            self.article_words_by_article.append(body_text_counter)
        self.num_article_words = len(all_article_words_set)
        return all_article_words_set

    def construct_matrix(self):
        """
        Constructs an articles by words numpy array and populates it
        with tfidf values for each article-word cell.
        :return: tfidf matrix, or None if matrix empty (usually occurs when no articles found for some reason,
        (for example, if working directory is not root directory)
        """

        # Initialize article ids and titles
        self.retrieve_article_ids_titles_filenames()

        # Get keywords to construct matrix
        all_article_words_list = list(self.get_article_text_by_article())
        matrix = zeros((self.num_articles, self.num_article_words))

        # Map each word to its column once, then visit only the words each
        # article actually contains instead of testing every word against
        # every article.
        column_of = {
            word: column
            for column, word in enumerate(all_article_words_list)
        }
        num_entries = 0
        for article_idx, word_counts in enumerate(
                self.article_words_by_article):
            for article_word, count in word_counts.items():
                matrix[article_idx, column_of[article_word]] += count
                num_entries += 1
        self.num_entries = num_entries  # Count num entries to calculate K

        #if matrix is empty, we cannot use it
        if matrix.shape == (0, 0):
            return None
        transformer = TfidfTransformer()
        tfidf_matrix = transformer.fit_transform(matrix).toarray()
        return tfidf_matrix
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('.'))

from flask import Flask, render_template, request, redirect
import subprocess
from Utils import subprocess_helpers
from Utils.DataSource import *

# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
# Single DataSource instance created at import time and used by the route handlers.
dataSource = DataSource()


def launch_preprocessors():
    """
    Start the query processor daemon followed by the article processor daemon.

    Both scripts are chained with "&&" in one shell command, so the second
    only starts if the first exits successfully. The call does not wait for
    the command to finish and returns None.
    """
    interpreter = subprocess_helpers.python_path
    command = (interpreter + " Daemons/QueryProcessorDaemon.py && " +
               interpreter + " Daemons/ArticleProcessorDaemon.py")
    process = subprocess.Popen(command,
                               executable=subprocess_helpers.executable,
                               shell=True,
                               universal_newlines=True)


@app.route("/", methods=["GET"])
def queries():
    # Get lists of query from database with counts of associated articles
    all_queries = dataSource.queries_route()
    queries_formatted = [{
        "id": q[0],
        "subject": q[1],