Example #1
0
 def set_collection_length(self, database):
     """
     Record the basis-vector length and report the collection size.

     Sets ``self.vec_length`` to the number of entries stored under
     'id2term' in the 'temp/terms_integerid' shelf.

     database -- filename of the document shelf, opened read-only.

     Returns the number of keys stored in ``database``.
     """
     db = shelve(database, 'r')
     try:
         temp_file = shelve('temp/terms_integerid', 'r')
         try:
             self.vec_length = len(temp_file['id2term'])
         finally:
             # Release the handle even if 'id2term' is missing.
             temp_file.close()
         return len(db)
     finally:
         db.close()
Example #2
0
    def analyze(self, url, links):
        """Create or refresh the stored document for ``url``.

        The document is stored in the 'database1' shelf under the MD5 hex
        digest of ``url``. An existing entry is reused, otherwise a new
        ``Document`` is created. In both cases the outgoing ``links`` are
        inserted, and the terms and term frequencies are recomputed from
        the page file named by ``self.retriever.filename(url)``.

        url   -- address of the page; also used to derive the shelf key.
        links -- outgoing links, passed to the document's ``insertOL``.
        """
        self.db = shelve('database1', 'w')
        try:
            key = md5(url).hexdigest()
            # Shelf membership is a direct key lookup; there is no need to
            # scan every key in the database.
            if key in self.db:
                doc = self.db[key]
            else:
                doc = Document(url)
            doc.insertOL(links)
            doc.url = url
            with open(self.retriever.filename(url)) as page:
                document = page.read()
            doc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
                unique_terms, document)
            # A shelf opened without writeback returns an unpickled copy, so
            # changes to an existing document are lost unless the object is
            # assigned back to its key.
            self.db[key] = doc
        finally:
            self.db.close()
Example #3
0
 def yield_documents(self):
     """
     Generator yielding every document stored in the shelf named by
     ``self.db``.

     The shelf is closed once the generator is exhausted, closed, or
     garbage-collected.
     """
     db = shelve(self.db, 'w')
     try:
         for key in db:
             yield db[key]
     finally:
         db.close()
Example #4
0
 def get_all_terms(self, pod=0.6):
     """
     Collect the terms of a fraction of the documents in the corpus.

     pod -- fraction of the stored documents whose ``all_terms`` lists are
            gathered (0.6 means 60%). The count is truncated with int().

     Returns one flat list with the ``all_terms`` of the selected
     documents concatenated in shelf iteration order. At least one
     document is used whenever the shelf is non-empty. An empty shelf
     yields an empty list.
     """
     from itertools import islice

     self.database = shelve(self.db, 'w')
     try:
         no_of_docs = int(pod * len(self.database))
         # Always take the first document, even when the fraction rounds
         # down to zero, so small corpora still contribute terms.
         limit = max(no_of_docs, 1)
         all_terms = []
         # islice stops quietly when fewer documents exist than requested.
         for key in islice(iter(self.database), limit):
             all_terms.extend(self.database[key].all_terms)
     finally:
         self.database.close()
     return all_terms
Example #5
0
    def analyze(self, url, links):
        """Create or refresh the stored document for ``url``.

        The document is stored in the "database1" shelf under the MD5 hex
        digest of ``url``. An existing entry is reused, otherwise a new
        ``Document`` is created. In both cases the outgoing ``links`` are
        inserted, and the terms and term frequencies are recomputed from
        the page file named by ``self.retriever.filename(url)``.

        url   -- address of the page; also used to derive the shelf key.
        links -- outgoing links, passed to the document's ``insertOL``.
        """
        self.db = shelve("database1", "w")
        try:
            key = md5(url).hexdigest()
            # Shelf membership is a direct key lookup; there is no need to
            # scan every key in the database.
            if key in self.db:
                doc = self.db[key]
            else:
                doc = Document(url)
            doc.insertOL(links)
            doc.url = url
            with open(self.retriever.filename(url)) as page:
                document = page.read()
            doc.all_terms = self.term_extractor.get_terms(document)
            unique_terms = self.term_extractor.get_unique_terms(document)
            doc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document)
            # A shelf opened without writeback returns an unpickled copy, so
            # changes to an existing document are lost unless the object is
            # assigned back to its key.
            self.db[key] = doc
        finally:
            self.db.close()
Example #6
0
    def get_all_terms(self, pod=0.6):
        """
        Collect the terms of a fraction of the documents in the corpus.

        pod -- fraction of the stored documents whose ``all_terms`` lists
               are gathered (0.6 means 60%). The count is truncated with
               int().

        Returns one flat list with the ``all_terms`` of the selected
        documents concatenated in shelf iteration order. At least one
        document is used whenever the shelf is non-empty. An empty shelf
        yields an empty list.
        """
        from itertools import islice

        self.database = shelve(self.db, 'w')
        try:
            no_of_docs = int(pod * len(self.database))
            # Always take the first document, even when the fraction rounds
            # down to zero, so small corpora still contribute terms.
            limit = max(no_of_docs, 1)
            all_terms = []
            # islice stops quietly when fewer documents exist than requested.
            for key in islice(iter(self.database), limit):
                all_terms.extend(self.database[key].all_terms)
        finally:
            self.database.close()
        return all_terms
Example #7
0
    def set_doc_vector(self):
        """
        Compute and store a normalised term-weight vector per document.

        For every document in the shelf named by ``self.db``, a vector of
        length ``self.vec_length`` is built. Each keyword from the
        'term2id' mapping that occurs in the document's
        ``unique_terms_freq`` gets the weight returned by
        ``mathutils.calculate_term_weight`` at that keyword's integer id.
        The vector is passed through ``mathutils.normalise_vector``,
        printed, and stored in the 'documentVectors' shelf under
        ``document.key``.

        Every shelf opened here is closed before returning, so the stored
        vectors are flushed to disk.
        """
        opened = []
        try:
            doc_freq = shelve('doc_frequencies', 'w')
            opened.append(doc_freq)
            # 'c' opens read/write and creates the shelf when it is missing.
            dv = shelve('documentVectors', 'c')
            opened.append(dv)
            keyword_database = shelve('temp/terms_to_integer', 'r')
            opened.append(keyword_database)
            keywords = keyword_database['term2id']
            db = shelve(self.db, 'w')
            opened.append(db)

            for shelf_key in db:
                document = db[shelf_key]
                # NOTE(review): assumes unique_terms_freq is a dict-like
                # mapping of term -> frequency; confirm against Document.
                doc_terms = document.unique_terms_freq

                doc_vector = zeros(self.vec_length)
                for kw in keywords:
                    # Direct membership test instead of scanning .keys().
                    if kw in doc_terms:
                        term_weight = mathutils.calculate_term_weight(
                            doc_terms[kw], doc_freq[kw], self.vec_length)
                        doc_vector[keywords[kw]] = term_weight

                doc_vector = mathutils.normalise_vector(doc_vector)
                print(doc_vector)
                dv[document.key] = doc_vector
        finally:
            # Close in reverse order of opening, even after an error.
            for handle in reversed(opened):
                handle.close()
Example #8
0
 def calculate_document_frequency(self):
     """
     Calculate the document frequency of every keyword.

     Document frequency is the number of documents whose ``all_terms``
     contain the keyword; it is later used for inverse document
     frequencies. Up to ``self.vec_length`` keywords are taken from
     ``self.yield_keyword()``. Each count is stored in the
     'doc_frequencies' shelf under the keyword and printed as
     "<keyword> <count>".

     The documents are read once in total rather than once per keyword.
     """
     from itertools import islice

     self.get_document_keywords()
     # 'c' opens read/write and creates the shelf when it is missing.
     doc_freq = shelve('doc_frequencies', 'c')
     try:
         keywords = list(islice(self.yield_keyword(), self.vec_length))
         counts = dict.fromkeys(keywords, 0)
         for document in self.yield_documents():
             # set() makes a term count once per document, however often
             # it occurs. NOTE(review): assumes terms are hashable strings.
             for term in set(document.all_terms):
                 if term in counts:
                     counts[term] += 1
         for kw in keywords:
             doc_freq[kw] = counts[kw]
             print("%s %s" % (kw, counts[kw]))
     finally:
         doc_freq.close()
Example #9
0
 def assign_id_to_terms(self, list_of_terms):
     """
     Assign an integer id to every term and persist both mappings.

     The id of a term is its position in ``list_of_terms``.

     id2term (integer -> term) is stored in the 'temp/terms_integerid'
     shelf. term2id (term -> integer) is stored in the
     'temp/terms_to_integer' shelf. Both shelves are created when missing
     and closed before returning, so the data is flushed to disk.

     Returns the term2id dictionary.
     """
     int_to_terms = dict(enumerate(list_of_terms))
     terms_to_int = dict(
         (term, term_id) for term_id, term in int_to_terms.items())

     # 'c' opens read/write and creates the shelf when it is missing.
     temp_file = shelve('temp/terms_integerid', 'c')
     try:
         temp_file['id2term'] = int_to_terms
     finally:
         temp_file.close()

     temp_file1 = shelve('temp/terms_to_integer', 'c')
     try:
         temp_file1['term2id'] = terms_to_int
     finally:
         temp_file1.close()

     return terms_to_int
Example #10
0
 def assign_id_to_terms(self, list_of_terms):
     """
     Assign an integer id to every term and persist both mappings.

     The id of a term is its position in ``list_of_terms``.

     id2term (integer -> term) is stored in the 'temp/terms_integerid'
     shelf. term2id (term -> integer) is stored in the
     'temp/terms_to_integer' shelf. Both shelves are created when missing
     and closed before returning, so the data is flushed to disk.

     Returns the term2id dictionary.
     """
     int_to_terms = dict(enumerate(list_of_terms))
     terms_to_int = dict(
         (term, term_id) for term_id, term in int_to_terms.items())

     # 'c' opens read/write and creates the shelf when it is missing.
     temp_file = shelve('temp/terms_integerid', 'c')
     try:
         temp_file['id2term'] = int_to_terms
     finally:
         temp_file.close()

     temp_file1 = shelve('temp/terms_to_integer', 'c')
     try:
         temp_file1['term2id'] = terms_to_int
     finally:
         temp_file1.close()

     return terms_to_int
Example #11
0
 def __init__(self):
     """
     Make sure the 'database1' shelf exists and set up the helpers.

     ``self.term_extractor`` is a ``parser.ExtractTerms`` instance and
     ``self.retriever`` a ``Retriever`` instance.
     """
     # 'c' creates the shelf when it is missing. Close the handle at once
     # rather than leaving it for the garbage collector.
     shelve('database1', 'c').close()
     self.term_extractor = parser.ExtractTerms()
     self.retriever = Retriever()
Example #12
0
 def yield_keyword(self):
     """
     Generator yielding every term stored in the 'id2term' mapping of
     the 'temp/terms_integerid' shelf.

     The mapping is loaded in full and the shelf closed before the first
     term is yielded, so no handle stays open while the caller iterates.
     """
     temp_file = shelve('temp/terms_integerid', 'w')
     try:
         id2term = temp_file['id2term']
     finally:
         temp_file.close()
     for value in id2term.values():
         yield value
Example #13
0
from datetime import timedelta, time, datetime, date
from tkinter import *
from os import popen
from shelve import open as shelve
from time import sleep
from pyperclip import copy
from json import loads
from functools import partial
from subprocess import Popen
from sys import exit
# Start-up checks and configuration. The script exits at once on weekends
# and on dates recorded in the 'holidays' shelf; otherwise it loads the
# settings from settings.json.
today = date.today()
day = today.weekday()  # 0 = Monday ... 6 = Sunday
if day == 6 or day == 5: exit(0)  # nothing to do on Sunday or Saturday
memory = shelve('holidays')
try:  # a key for today's date (dd-mm-YYYY) marks a holiday
    memory[today.strftime('%d-%m-%Y')]
    memory.close()
    exit(0)
except KeyError:
    # Not a holiday: release the shelf and carry on.
    memory.close()
# Indexed by weekday(); only Monday-Friday can reach this point.
week = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
with open('settings.json') as f:
    # 'memory' is rebound here from the holidays shelf to the settings dict.
    memory = loads(f.read())
    # 'alert' is read as a number of minutes.
    wait = timedelta(minutes=memory['alert'])
    # Calling start_meeting() launches the executable at 'exe_path'.
    start_meeting = partial(Popen, [memory['exe_path']])
    cmd_class = memory['command_after_class']
    cmd_day = memory['command_after_day']
    # 'over by' is sliced as two hour digits, one separator character, then
    # the minutes (e.g. "HH:MM"), and combined with today's date.
    over = memory['over by']
    over = datetime.combine(today,
                            time(hour=int(over[:2]), minute=int(over[3:])))
# Name of the current weekday.
dow = week[day]
Example #14
0
 def __init__(self):
     """
     Make sure the "database1" shelf exists and set up the helpers.

     ``self.term_extractor`` is a ``parser.ExtractTerms`` instance and
     ``self.retriever`` a ``Retriever`` instance.
     """
     # "c" creates the shelf when it is missing. Close the handle at once
     # rather than leaving it for the garbage collector.
     shelve("database1", "c").close()
     self.term_extractor = parser.ExtractTerms()
     self.retriever = Retriever()
Example #15
0
"""
This module implements the class for handling the user queries.

temp/terms_to_integer stores the basis vector in a dictionary with the
terms mapped to integers.
"""
from shelve import DbfilenameShelf as shelve
from Models.document import Document
from porter import PorterStemmer
import mathutils
from counter import Counter
from numpy import zeros
from Models.document import Document
from numpy import dot

# Module-level state, loaded once at import time. All three shelves are
# opened read-only and are not closed anywhere in this part of the file.

# Basis vector: the terms found under 'term2id' (term -> integer id).
keyword_database = shelve('temp/terms_to_integer', 'r')
keywords = keyword_database['term2id'].keys()
# Number of terms in the basis vector.
vec_length = len(keywords)
porter_stemmer = PorterStemmer()
# NOTE(review): presumably the per-document vectors and the crawled
# documents respectively, judging by the shelf names; confirm against the
# code that uses them.
dv = shelve('documentVectors', 'r')
doc_database = shelve('database1', 'r')


def query_parser(query):
    """
    The query string is split into words or terms. The terms are then
    checked if they are present in our basis vector. The terms which are
    found in the basis vector are then mapped to their integer ids and 
    returned as a vector.
    """
Example #16
0
"""
This module implements the class for handling the user queries.

temp/terms_to_integer stores the basis vector in a dictionary with the
terms mapped to integers.
"""
from shelve import DbfilenameShelf as shelve
from Models.document import Document
from porter import PorterStemmer
import mathutils
from counter import Counter
from numpy import zeros
from Models.document import Document
from numpy import dot

# Module-level state, loaded once at import time. All three shelves are
# opened read-only and are not closed anywhere in this part of the file.

# Basis vector: the terms found under 'term2id' (term -> integer id).
keyword_database = shelve('temp/terms_to_integer', 'r')
keywords = keyword_database['term2id'].keys()
# Number of terms in the basis vector.
vec_length = len(keywords)
porter_stemmer = PorterStemmer()
# NOTE(review): presumably the per-document vectors and the crawled
# documents respectively, judging by the shelf names; confirm against the
# code that uses them.
dv = shelve('documentVectors', 'r')
doc_database = shelve('database1', 'r')

def query_parser(query):
    """
    The query string is split into words or terms. The terms are then
    checked if they are present in our basis vector. The terms which are
    found in the basis vector are then mapped to their integer ids and 
    returned as a vector.
    """
   
    query_terms = query.split()