def set_collection_length(self, database):
    """
    Return the number of entries stored in the `database` shelf.

    As a side effect, sets ``self.vec_length`` to the number of entries in
    the ``id2term`` mapping stored in the ``temp/terms_integerid`` shelf.

    database -- filename of the shelf holding the document collection.

    Both shelves are opened read-only and are always closed again, even when
    reading from them raises.
    """
    db = shelve(database, 'r')
    try:
        temp_file = shelve('temp/terms_integerid', 'r')
        try:
            self.vec_length = len(temp_file['id2term'])
        finally:
            temp_file.close()
        # len() asks the underlying dbm directly; no key list is built.
        return len(db)
    finally:
        db.close()
def analyze(self, url, links):
    """
    Create or update the stored document for `url` and record its links.

    url   -- address of the page; its md5 hex digest is the shelf key.
    links -- outgoing links, passed to the document's insertOL().

    The page text is read from the file named by
    ``self.retriever.filename(url)`` and run through ``self.term_extractor``
    to fill ``all_terms`` and ``unique_terms_freq`` on the document.

    The 'database1' shelf is bound to ``self.db`` for the duration of the
    call and is closed before returning, also when an error occurs.
    """
    self.db = shelve('database1', 'w')
    try:
        key = md5(url).hexdigest()
        # Direct key lookup instead of scanning every key in the shelf.
        if key in self.db:
            doc = self.db[key]
        else:
            doc = Document(url)
        doc.insertOL(links)
        doc.url = url
        with open(self.retriever.filename(url)) as page:
            document = page.read()
        doc.all_terms = self.term_extractor.get_terms(document)
        unique_terms = self.term_extractor.get_unique_terms(document)
        doc.unique_terms_freq = self.term_extractor.count_term_frequencies(
            unique_terms, document)
        # The shelf is opened without writeback, so an object fetched from it
        # is a detached copy: it has to be stored again or changes are lost.
        self.db[key] = doc
    finally:
        self.db.close()
def yield_documents(self):
    """
    Generator yielding every value stored in the shelf named by self.db.

    The shelf is closed as soon as iteration finishes, and also when the
    generator is closed or discarded before being exhausted.
    """
    db = shelve(self.db, 'w')
    try:
        # Walk the keys and fetch each value, which is what itervalues()
        # does for a shelf.
        for key in db:
            yield db[key]
    finally:
        db.close()
def get_all_terms(self, pod=0.6):
    """
    Collect the terms of a fraction of the documents in the shelf self.db.

    pod -- fraction of the stored documents whose ``all_terms`` lists are
           concatenated (0.6 means the first 60% in shelf iteration order).

    Returns one flat list of terms. At least one document is always used
    when the shelf is not empty, and an empty shelf yields an empty list.

    The shelf is bound to ``self.database`` while the method runs and is
    closed before returning, also when an error occurs.
    """
    self.database = shelve(self.db, 'w')
    try:
        no_of_docs_in_database = len(self.database)
        if no_of_docs_in_database == 0:
            return []
        # The first document is always included, even when the requested
        # fraction rounds down to zero.
        no_of_docs = max(1, int(pod * no_of_docs_in_database))
        all_terms = []
        for position, key in enumerate(self.database):
            if position >= no_of_docs:
                break
            all_terms.extend(self.database[key].all_terms)
        return all_terms
    finally:
        self.database.close()
def analyze(self, url, links):
    """
    Create or update the stored document for `url` and record its links.

    url   -- address of the page; its md5 hex digest is the shelf key.
    links -- outgoing links, passed to the document's insertOL().

    The page text is read from the file named by
    ``self.retriever.filename(url)`` and run through ``self.term_extractor``
    to fill ``all_terms`` and ``unique_terms_freq`` on the document.

    The "database1" shelf is bound to ``self.db`` for the duration of the
    call and is closed before returning, also when an error occurs.
    """
    self.db = shelve("database1", "w")
    try:
        key = md5(url).hexdigest()
        # Membership test on the shelf itself; no scan over all keys.
        doc = self.db[key] if key in self.db else Document(url)
        doc.insertOL(links)
        doc.url = url
        with open(self.retriever.filename(url)) as page:
            document = page.read()
        doc.all_terms = self.term_extractor.get_terms(document)
        unique_terms = self.term_extractor.get_unique_terms(document)
        doc.unique_terms_freq = self.term_extractor.count_term_frequencies(unique_terms, document)
        # Without writeback the fetched object is a detached copy, so it is
        # stored again for both new and existing documents.
        self.db[key] = doc
    finally:
        self.db.close()
def get_all_terms(self, pod=0.6):
    """
    Collect the terms of a fraction of the documents in the shelf self.db.

    pod -- fraction of the stored documents whose ``all_terms`` lists are
           concatenated (0.6 means the first 60% in shelf iteration order).

    Returns one flat list of terms. At least one document is always used
    when the shelf is not empty, and an empty shelf yields an empty list.

    The shelf is bound to ``self.database`` while the method runs and is
    closed before returning, also when an error occurs.
    """
    self.database = shelve(self.db, 'w')
    try:
        total_docs = len(self.database)
        if not total_docs:
            return []
        # Never fewer than one document, even if the fraction rounds to 0.
        wanted = max(1, int(pod * total_docs))
        collected = []
        for index, key in enumerate(self.database):
            if index >= wanted:
                break
            collected.extend(self.database[key].all_terms)
        return collected
    finally:
        self.database.close()
def set_doc_vector(self):
    """
    Compute a normalised weight vector for every document and store it.

    For each document in the shelf self.db, a zero vector of length
    ``self.vec_length`` is created; for every keyword of the ``term2id``
    mapping that occurs in the document's ``unique_terms_freq``, the slot
    given by the keyword's integer id receives
    ``mathutils.calculate_term_weight(tf, doc_freq[kw], self.vec_length)``.
    The vector is passed through ``mathutils.normalise_vector``, printed,
    and saved in the 'documentVectors' shelf under ``document.key``.

    Every shelf opened here is closed before returning so that the stored
    vectors are flushed to disk.
    """
    # NOTE(review): the third argument to calculate_term_weight is the
    # vocabulary size (vec_length), exactly as in the original code - confirm
    # that this is what mathutils expects.
    opened = []
    try:
        doc_freq = shelve('doc_frequencies', 'w')
        opened.append(doc_freq)
        keyword_database = shelve('temp/terms_to_integer', 'r')
        opened.append(keyword_database)
        keywords = keyword_database['term2id']
        # Flag 'c' creates the vector shelf when it does not exist yet.
        dv = shelve('documentVectors', 'c')
        opened.append(dv)
        db = shelve(self.db, 'w')
        opened.append(db)
        for shelf_key in db:
            document = db[shelf_key]
            doc_terms = document.unique_terms_freq
            doc_vector = zeros(self.vec_length)
            for kw in keywords:
                # Terms missing from the document keep a weight of zero.
                if kw in doc_terms:
                    tf = doc_terms[kw]
                    term_weight = mathutils.calculate_term_weight(
                        tf, doc_freq[kw], self.vec_length)
                    doc_vector[keywords[kw]] = term_weight
            doc_vector = mathutils.normalise_vector(doc_vector)
            print(doc_vector)
            dv[document.key] = doc_vector
    finally:
        for handle in opened:
            handle.close()
def calculate_document_frequency(self):
    """
    Calculate the document frequencies of the keywords.

    The document frequency of a keyword is the number of documents whose
    ``all_terms`` contain it. Only the first ``self.vec_length`` keywords
    produced by ``self.yield_keyword()`` are considered. Each result is
    stored in the 'doc_frequencies' shelf under the keyword and printed as
    "keyword count".

    The documents are read in one pass, not once per keyword, and the
    result shelf is always closed.
    """
    from itertools import islice

    self.get_document_keywords()
    keywords = list(islice(self.yield_keyword(), self.vec_length))
    counts = dict.fromkeys(keywords, 0)
    for document in self.yield_documents():
        # A set makes each term count at most once per document.
        for term in set(document.all_terms):
            if term in counts:
                counts[term] += 1
    # Flag 'c' creates the shelf if needed and opens it read/write.
    doc_freq = shelve('doc_frequencies', 'c')
    try:
        for kw in keywords:
            doc_freq[kw] = counts[kw]
            print("%s %s" % (kw, counts[kw]))
    finally:
        doc_freq.close()
def assign_id_to_terms(self, list_of_terms):
    """
    Assign an integer id to every term and persist both mappings.

    list_of_terms -- sequence of terms; a term's position is its id.

    The id -> term mapping is saved as 'id2term' in the shelf
    'temp/terms_integerid', and the term -> id mapping as 'term2id' in the
    shelf 'temp/terms_to_integer'. Both shelves are closed before returning
    so the data is flushed to disk.

    Returns the term -> id dictionary.
    """
    int_to_terms = dict(enumerate(list_of_terms))
    terms_to_int = dict((int_to_terms[key], key) for key in int_to_terms)

    # Flag 'c' creates the shelf when missing and opens it read/write.
    temp_file1 = shelve('temp/terms_to_integer', 'c')
    try:
        temp_file1['term2id'] = terms_to_int
    finally:
        temp_file1.close()

    temp_file = shelve('temp/terms_integerid', 'c')
    try:
        temp_file['id2term'] = int_to_terms
    finally:
        temp_file.close()
    return terms_to_int
def assign_id_to_terms(self, list_of_terms):
    """
    Assign an integer id to every term and persist both mappings.

    list_of_terms -- sequence of terms; a term's position is its id.

    'id2term' (integer -> term) is written to the shelf
    'temp/terms_integerid' and 'term2id' (term -> integer) to the shelf
    'temp/terms_to_integer'. Each shelf is closed right after it is written
    so the mapping reaches the disk.

    Returns the term -> integer dictionary.
    """
    int_to_terms = dict(enumerate(list_of_terms))
    terms_to_int = dict((int_to_terms[key], key) for key in int_to_terms)

    # One ('path', 'entry', value) triple per shelf; flag 'c' creates the
    # file if it is missing and opens it for writing.
    targets = (
        ('temp/terms_to_integer', 'term2id', terms_to_int),
        ('temp/terms_integerid', 'id2term', int_to_terms),
    )
    for path, entry, mapping in targets:
        store = shelve(path, 'c')
        try:
            store[entry] = mapping
        finally:
            store.close()
    return terms_to_int
def __init__(self):
    """
    Make sure the 'database1' shelf exists and set up the helper objects.

    Sets ``self.term_extractor`` to a ``parser.ExtractTerms`` instance and
    ``self.retriever`` to a ``Retriever`` instance.
    """
    # Flag 'c' creates the shelf if it is missing; the handle is only needed
    # for that, so close it right away so it does not leak.
    shelve('database1', 'c').close()
    self.term_extractor = parser.ExtractTerms()
    self.retriever = Retriever()
def yield_keyword(self):
    """
    Generator yielding every term of the stored 'id2term' mapping.

    The mapping is read from the shelf 'temp/terms_integerid' in one go and
    the shelf is closed before the first term is yielded, so no handle stays
    open while the caller consumes the generator.
    """
    temp_file = shelve('temp/terms_integerid', 'w')
    try:
        id2term = temp_file['id2term']
    finally:
        temp_file.close()
    for value in id2term.values():
        yield value
from datetime import timedelta, time, datetime, date
from tkinter import *
from os import popen
from shelve import open as shelve
from time import sleep
from pyperclip import copy
from json import loads
from functools import partial
from subprocess import Popen
from sys import exit

# Nothing to do on weekends: weekday() is 5 for Saturday and 6 for Sunday.
today = date.today()
day = today.weekday()
if day in (5, 6):
    exit(0)  #sunday or saturday

# A key for today's date in the 'holidays' shelf marks a holiday.
memory = shelve('holidays')
try:
    memory[today.strftime('%d-%m-%Y')]
except KeyError:
    # Regular working day: release the shelf and carry on.
    memory.close()
else:
    memory.close()
    exit(0)

week = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')

# From here on `memory` holds the settings read from settings.json.
with open('settings.json') as f:
    memory = loads(f.read())

wait = timedelta(minutes=memory['alert'])
start_meeting = partial(Popen, [memory['exe_path']])
cmd_class = memory['command_after_class']
cmd_day = memory['command_after_day']

# 'over by' is read as two digits of hour, one separator character, then the
# minutes; combine it with today's date.
over = memory['over by']
over = datetime.combine(
    today,
    time(hour=int(over[:2]), minute=int(over[3:])),
)
dow = week[day]
def __init__(self):
    """
    Make sure the "database1" shelf exists and set up the helper objects.

    Sets ``self.term_extractor`` to a ``parser.ExtractTerms`` instance and
    ``self.retriever`` to a ``Retriever`` instance.
    """
    # Opening with flag "c" creates the shelf when it is missing. The handle
    # serves no other purpose, so it is closed at once.
    shelve("database1", "c").close()
    self.term_extractor = parser.ExtractTerms()
    self.retriever = Retriever()
""" This module implements the class for handling the user queries. temp/terms_to_integer stores the basis vector in a dictionary with the terms mapped to integers. """ from shelve import DbfilenameShelf as shelve from Models.document import Document from porter import PorterStemmer import mathutils from counter import Counter from numpy import zeros from Models.document import Document from numpy import dot keyword_database = shelve('temp/terms_to_integer', 'r') keywords = keyword_database['term2id'].keys() vec_length = len(keywords) porter_stemmer = PorterStemmer() dv = shelve('documentVectors', 'r') doc_database = shelve('database1', 'r') def query_parser(query): """ The query string is split into words or terms. The terms are then checked if they are present in our basis vector. The terms which are found in the basis vector are then mapped to their integer ids and returned as a vector. """
""" This module implements the class for handling the user queries. temp/terms_to_integer stores the basis vector in a dictionary with the terms mapped to integers. """ from shelve import DbfilenameShelf as shelve from Models.document import Document from porter import PorterStemmer import mathutils from counter import Counter from numpy import zeros from Models.document import Document from numpy import dot keyword_database = shelve('temp/terms_to_integer', 'r') keywords = keyword_database['term2id'].keys() vec_length = len(keywords) porter_stemmer = PorterStemmer() dv = shelve('documentVectors', 'r') doc_database = shelve('database1', 'r') def query_parser(query): """ The query string is split into words or terms. The terms are then checked if they are present in our basis vector. The terms which are found in the basis vector are then mapped to their integer ids and returned as a vector. """ query_terms = query.split()