Code Example #1
def handler(postingList):
    """
        @describe: Load an on-disk postings-list record into memory
        @param postingList: one postings-list record as stored on disk, containing the term, document frequency, and the high- and low-end postings lists
    """
    # Initialization
    term = postingList['term']
    df = postingList['df']
    highPostingList = postingList['highPostingList']
    lowPostingList = postingList['lowPostingList']
    # Create the dictionary entry for the term
    hash[term] = Term()
    hash[term].df = df
    # Assemble the high-end postings list
    if len(highPostingList) > 0:
        link = Link()
        for posting in highPostingList:
            link.append(
                Posting(posting['docID'], posting['tf'],
                        posting['static_grade'], posting['time']))
        hash[term].high_posting_list = link.head
    # Assemble the low-end postings list
    if len(lowPostingList) > 0:
        link = Link()
        for posting in lowPostingList:
            link.append(
                Posting(posting['docID'], posting['tf'],
                        posting['static_grade'], posting['time']))
        hash[term].low_posting_list = link.head
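The handler above relies on Term, Link, and Posting classes and a module-level hash dictionary that are defined elsewhere in the project and not shown here. A minimal sketch of what those structures could look like, inferred from how they are used (the field names and linked-list layout are assumptions, not the project's actual definitions):

class Posting:
    # One postings-list node: document ID, term frequency, static score, timestamp
    def __init__(self, docID, tf, static_grade, time, next=None):
        self.docID = docID
        self.tf = tf
        self.static_grade = static_grade
        self.time = time
        self.next = next  # pointer to the next node in the singly linked list


class Link:
    # Builds a singly linked list of Posting nodes and exposes its head
    def __init__(self):
        self.head = None
        self.tail = None

    def append(self, posting):
        if self.head is None:
            self.head = self.tail = posting
        else:
            self.tail.next = posting
            self.tail = posting


class Term:
    # Dictionary entry: document frequency plus the heads of the two postings lists
    def __init__(self):
        self.df = 0
        self.high_posting_list = None
        self.low_posting_list = None


hash = {}  # term -> Term; the in-memory dictionary that handler() fills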
Code Example #2
File: main.py  Project: ndriker/searchEngine
def indexer(inverted_index):
    doc_id_file = open("doc_ids_urls.txt", "w")
    n = 0

    #/home/fghiasi/M1_project/searchEngine/examples/aiclub_ics_uci_edu
    #/home/fghiasi/inf141Proj2_last_update/inf141Proj2/Assignment3/DEV
    #dev_path = "C:\\Users\\NoobMaster69\\Desktop\\School\\CS 121 - Info Retrieval\\Assignments\\3-Search-Engine\\M1\\developer\\DEV"
    dev_path = "doh"
    documents = searching_all_files(dev_path)
    #documents = searching_all_files('/home/fghiasi/inf141Proj2_last_update/inf141Proj2/Assignment3/DEV')
    # documents = ['/home/fghiasi/M1_project/searchEngine/examples/aiclub_ics_uci_edu/8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b.json']

    for document in documents:
        n += 1
        content = extract_json_content(document, 'content')
        url = extract_json_content(document, 'url')

        doc_id_url_str = "{} {}\n".format(n, url)
        doc_id_file.write(doc_id_url_str)

        text = tokenize(content)
        word_freq = computeWordFrequencies(text)

        for i, token in enumerate(text):
            if token not in inverted_index:
                inverted_index[token] = []
            inverted_index[token].append(Posting(n, word_freq[token], i))
            # print(token, " ", Posting(n, word_freq[token], i))
    doc_id_file.close()
    return inverted_index
Code Example #3
def build_index():
    """
    Builds and index from a given folder.

    Normalizes the documents, tokezine them, and create the index.

    This function is called only when the user has provided a wrong
    index file, or even when it did not provide anything at all.
    """

    processors.append(NormalizerProcessor())

    # Fetches every document from the input folder
    print('[FETCHING]\tReading text files from \'{0}\'...'.format(folder))
    documents = Document.fetch(folder, True)

    # Normalizes every loaded document
    print('[PROCESSING]\tNormalizing words from every document...')
    tokenize_all(documents)

    # Creates the index by mapping every word
    # to all the documents that reference it
    print('[INDEXING]\tBuilding index from words...\n')
    posting_list = Posting.index(tokenized_documents)
    index = Index.build(posting_list)

    return index
Code Example #4
File: parser.py  Project: blobsey/burritosearcher
def finalizeIndex(threadNum, N, duplicates):
    # Load a partialIndex; it contains a dict in the format {token: list of tuples},
    # where each tuple is (docid, wordfreq),
    # e.g. the word freq of one entry is partialIndex[token][i][1]
    with open(tempIndexPath + str(threadNum) + ".tmp", "rb") as file:
        with open(indexPath + str(threadNum) + ".p", "wb") as dump:
            partialIndex = pickle.load(file)
            postings = defaultdict(list)
            positions = dict()  # seek position of each token in the index file
            # Construct a posting list for each token in partialIndex
            for token in partialIndex:
                positions[token] = dump.tell()  # record the seek position for easy access later
                # Calculate tf-idf; build the posting list with docid, termfreq, tf-idf
                df = len(partialIndex[token])
                for tokentuple in partialIndex[token]:
                    if tokentuple[0] in duplicates:
                        continue
                    tf = math.log10(tokentuple[1]) + 1
                    posting = Posting(tokentuple[0], tokentuple[1],
                                      tf * math.log10(N / df))
                    postings[token].append(posting)
                pickle.dump(postings[token], dump, pickle.HIGHEST_PROTOCOL)

    del partialIndex
    gc.collect()

    #dump the positions dict
    with open(indexPath + str(threadNum) + ".positions", "wb") as dump:
        pickle.dump(positions, dump)
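The weight stored in each Posting above is a standard log-scaled tf-idf: (1 + log10 of the raw term frequency) multiplied by log10(N / document frequency). A quick illustration with made-up numbers (the values below are assumptions for the example, not taken from the project):

import math

tf_raw = 5    # the term occurs 5 times in the document (assumed)
df = 100      # the term appears in 100 documents (assumed)
N = 10000     # total number of documents in the corpus (assumed)

tf = math.log10(tf_raw) + 1       # 1.699
weight = tf * math.log10(N / df)  # 1.699 * 2.0 = 3.398
print(round(weight, 3))           # 3.398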
Code Example #5
def search(dict_file: str, post_file: str, query_in: str, query_out: str):
    """
    Open all the file and the load the dictionary and the list of all the documents id
    Then, parse each query and create the respective AST that can evaluate itself
    Finally, print the query result in the out file

    *params*
        - dict_file The filename of the dictionary file
        - post_file The filename of the postings file
        - query_in The filename of the query file
        - query_out The filename of the output file
    """
    with open(dict_file, mode="rb") as dictionary_file,\
    open(post_file, mode="rb") as postings_file,\
    open(query_in, encoding="utf8") as q_in,\
    open(query_out, mode="w", encoding="utf8") as q_out:
        dictionary = pickle.load(dictionary_file)
        posting = Posting(dictionary, postings_file)
        file_list = posting['__all__']
        for query in q_in:
            print(" ".join(
                map(
                    str,
                    shunting_yard_AST(tokenize(query)).eval(
                        posting, file_list).list)),
                  end='\n',
                  file=q_out)
Code Example #6
def reduce(self):
    for tok_doc in self.tok_docs:
        for token, frequency in tok_doc.tokens_freq.items():
            posting = Posting(tok_doc.document.doc_id,
                              freq=frequency,
                              occ=tok_doc.tokens_occ[token])
            # print(posting.__dict__)
            self.reduced_terms[token].append(posting.__dict__)
    return self.reduced_terms
Code Example #7
File: serializer.py  Project: tonalan/Inf141_indexer
def deserializeIndexItem(line):
    line = line.split(" ")
    postings = []

    # Fields from index 2 onward alternate docID, frequency; rebuild one Posting per pair
    for i in range(2, len(line)):
        if i % 2 == 0:
            postings.append(Posting(line[i], int(line[i + 1])))

    # line[0] is the term and line[1] its stored count (apparently the document frequency)
    return line[0], line[1], postings
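A usage sketch, assuming the serialized line has the form "term count docID1 freq1 docID2 freq2 ..." (that layout is inferred from the parsing loop above, not documented in the project):

line = "workshop 2 14 3 97 1"
term, count, postings = deserializeIndexItem(line)
# term == "workshop", count == "2"
# postings == [Posting("14", 3), Posting("97", 1)]  (docIDs are kept as strings here)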
Code Example #8
def reduce_tok_doc(tok_doc, reduced_terms=None, documents=None):
    reduced_terms = defaultdict(
        list) if reduced_terms is None else reduced_terms
    for token, frequency in tok_doc.tokens_freq.items():
        posting = Posting(tok_doc.document.doc_id,
                          freq=frequency,
                          occ=tok_doc.tokens_occ[token])
        # print(posting.__dict__)
        reduced_terms[token].append(posting.__dict__)
    return reduced_terms
Code Example #9
    def _expand_inverted_index(self, quoted_phrases, expansion):
        inv_idx = self.inverted_index

        for phrase in quoted_phrases:
            phrase = word_tokenize(phrase)
            # Create a new term representing the phrase and add it to the expansion
            new_term = "_".join(phrase)
            expansion[new_term] = list()
            # Get the doc_ids in the lines for each term in the phrase
            term_lines = [[posting for posting in inv_idx[word]]
                          for word in phrase]

            # Select the documents which contain all the phrase's words
            filtered_postings = []
            intersection = self._intersection(term_lines, filtered_postings)
            while len(intersection) > 0:
                # Occurrences is a list of lists (matrix), where each line i contains
                # the occurrences of the ith word of the phrase in the document
                # intersection[0].
                occurrences = [
                    postings_list[0].positions
                    for postings_list in filtered_postings
                ]
                # Subtract from all elements in each line of occurrences the line
                # index (line 0 is ignored).
                for i in range(1, len(occurrences)):
                    occurrences[i] = [(occurrence - i)
                                      for occurrence in occurrences[i]]
                # The positions in which there are intersections are occurrences
                # of the whole phrase in the document.
                phrase_occurrences = self._intersection(occurrences)
                # If there are occurrences, create a Posting for the document in
                # expansion[new_term]
                if len(phrase_occurrences) > 0:
                    new_posting = Posting(intersection[0])
                    new_posting.positions = deepcopy(phrase_occurrences)
                    expansion[new_term].append(new_posting)
                # Delete the heads of intersection and filtered_posting's lines
                for postings_list in filtered_postings:
                    del postings_list[0]
                del intersection[0]
Code Example #10
def readQuery(city, queryEmail):
    '''
    Queries a city, and parses the result into a list
    of Posting objects.

    Args:
    city: A string representation of the city to query
    queryEmail: a boolean indicating whether to look up the reply_to e-mail
    associated with each post. This (unfortunately) is quite slow.

    Returns:
    postingsObject: A Postings object wrapping the list of postings.
    '''
    if city == 'test':
        #A stored xml file
        city = 'seattle'
        f = open('files/seattle.xml')
        minneapolisString = f.read()
        f.close()
        dom = parseString(minneapolisString)
    else:
        dom = __parseQuery(city)

    domItems = dom.getElementsByTagName('item')
    postings = []
    for i, item in enumerate(domItems):
        title = item.getElementsByTagName('title')[0].childNodes[0].wholeText
        postingDate = item.getElementsByTagName(
            'dc:date')[0].childNodes[0].wholeText
        permalink = item.getElementsByTagName(
            'link')[0].childNodes[0].wholeText
        #Get the reply_to field
        #Only queried when queryEmail is set, due to speed issues.
        #TODO: Change this to work
        if (queryEmail):
            print i
            permalinkPage = urllib.urlopen(permalink)
            permalinkHTML = permalinkPage.read()
            permalinkPage.close()
            match = re.search('(mailto:)([^?]+)', permalinkHTML)
            if match:
                mailToLink = match.group(2)
            else:
                mailToLink = "emailnotfound"
        else:
            mailToLink = "emailnotfound"
        shortDescription = item.getElementsByTagName(
            'description')[0].childNodes[0].wholeText
        postings.append(
            Posting(city, title, permalink, shortDescription, postingDate,
                    mailToLink))
    postingsObject = Postings(postings)
    return postingsObject
Code Example #11
def handler(weibo):
    """ Process one weibo (microblog post) """
    tokens = jieba.cut_for_search(weibo['mt'])
    # Static score of the weibo: a*log(repost count) + b*log(comment count)
    static_grade = math.log(weibo['rc'] + 1) + math.log(weibo['cc'] + 1)
    # Normalize the generated tokens
    self.__normalization(weibo['_id'], tokens, static_grade, weibo['ct'])
    # Add each nickname to the nickname postings list
    nicknames = weibo['nc']
    nicknames.append(weibo['sn'])
    for nickname in nicknames:
        self.__addNickNamePosting(
            nickname,
            Posting(weibo['_id'], 1, static_grade, weibo['ct']))
Code Example #12
File: main.py  Project: ndriker/searchEngine
def get_postings(tokenized_word, file_ptr):
    # file_ptr should be inverted_index
    postings = []

    for line in file_ptr:  #"workshop\n"
        line_txt = line.strip("\n")  #"workshop"
        if line_txt == tokenized_word:  #we found the token
            while line_txt != '$':
                line_txt = file_ptr.readline().strip("\n")
                if line_txt != "$":
                    p_values = line_txt.strip().split(
                        ',')  # 1,22,1035 -> ['1', '22', '1035']
                    assert 3 == len(p_values)
                    postings.append(
                        Posting(p_values[0], p_values[1], p_values[2]))
            file_ptr.seek(0)
            return postings
    file_ptr.seek(0)
    return postings  # []
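A sketch of the plain-text index layout that get_postings appears to expect: the token on its own line, then one "docID,frequency,position" line per posting, then a line containing only "$" to end the block. This layout is inferred from the parsing loop above, not from project documentation, and it assumes the project's Posting class is importable:

# Write a tiny hypothetical index file and read the postings for one token back
index_text = "workshop\n1,22,1035\n54,3,17\n$\n"
with open("inverted_index.txt", "w") as f:
    f.write(index_text)

with open("inverted_index.txt") as f:
    postings = get_postings("workshop", f)
# postings == [Posting('1', '22', '1035'), Posting('54', '3', '17')]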
Code Example #13
    def add_tokens_of_doc(self, content, docId):
        """
        Processes content of this document and updates indices and postings lists.

        Params:
            - content: tokens in the document content
            - docId: document ID
        
        Returns:
            - normalised_tf: normalised length of document
        """
        tf_freq = {} # Term frequencies of this document

        position = 0 # position of token in content
        for token in content:
            if token not in self.terms:
                self.terms[token] = {}
                self.terms[token]["offset"] = None
                self.terms[token]["size"] = None
                self.terms[token]["docFreq"] = 1
                self.terms[token]["posting"] = Posting()
            else:
                if token not in tf_freq:
                    self.terms[token]["docFreq"] += 1

            self.terms[token]["posting"].add_doc_to_postings(docId)
            self.terms[token]["posting"].add_pos_to_doc(docId, position)

            position += 1

            if token in tf_freq:
                tf_freq[token] += 1
            else:
                tf_freq[token] = 1


        normalised_tf = 0
        for token in tf_freq.keys():
            freq = tf_freq[token]
            normalised_tf += pow((1 + util.log10(freq)), 2)

        return sqrt(normalised_tf)
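The return value above is the document's vector length under 1 + log10(tf) weighting, i.e. the square root of the sum of squared log-weights, which can later be used for cosine length normalisation. A quick check with assumed term frequencies (using math.log10 in place of the project's util.log10):

from math import log10, sqrt

tf_freq = {"apple": 3, "banana": 1}  # assumed term frequencies for one document
length = sqrt(sum((1 + log10(freq)) ** 2 for freq in tf_freq.values()))
print(round(length, 4))  # sqrt(1.4771**2 + 1**2) = 1.7838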
Code Example #14
    def create_tokens(self, root, path, url):
        '''
        Tokenizes the given file, stems and counts its words, and adds a Posting
        for each qualifying word to self.tokens.
        '''
        #http://nltk.org
        #https://pythonspot.com/tokenizing-words-and-sentences-with-nltk/ -has info on stop words and stemming

        with open(root + path, 'r') as myfile:
            soup = BeautifulSoup(myfile, 'lxml')

        # kill all script and style elements
        for script in soup(
            ["script", "style"]
        ):  #Source: https://stackoverflow.com/questions/22799990/beatifulsoup4-get-text-still-has-javascript
            script.decompose()  #rip it out

        #Get title
        try:
            title = self.cleanhtml(str(soup.find_all("title")[0]))
            print("PRINTING TITLE")
            print(title)
        except IndexError:
            title = ""

        raw_text = soup.get_text()

        raw_tokens = nltk.word_tokenize(raw_text)
        print("Compute RAW TOKENS")

        filtered_tokens = self.remove_stop_words(raw_tokens)
        print("Compute filtered_tokens")

        #Stemming of all the tokens gathered
        words_counter = self.create_stemmed_word_count_dictionary(
            filtered_tokens)
        print("Compute word_dict")
        for word, count in words_counter.items():
            if (self.is_ascii(word) and (not self.is_number(word))):
                if len(word) < 182 and len(
                        word) > 2:  #Can't have large strings for db keys
                    posting = Posting(path, url, title)
                    posting.set_frequency(count)
                    posting.set_length_of_doc(len(raw_tokens))
                    if word not in self.tokens.keys():
                        self.tokens[word] = [posting]
                    else:
                        self.tokens[word].append(posting)
        if path == "www.ics.uci.edu/faculty":
            print(words_counter)
Code Example #15
def __normalization(self, docID, tokens, static_grade, time):
    """
        @describe: Linguistically preprocess and normalize the tokens, then add the normalized terms to the inverted index
        @param docID: document ID
        @param tokens: all tokens in the document
        @param static_grade: the document's static score
        @param time: time the weibo was posted
    """
    terms = {}  # temporary dict holding every term in this document and its tf
    for token in tokens:
        if token not in '‘~!@#$%^&*()_+{}|:"<>?`-=[]\;\',./ !¥……()——:“”《》?·【】、;‘’,。丶~→......的了是转发回复赞谢谢thttpV':
            # Filter out special characters
            if token not in terms:  # first occurrence of the term in this document
                terms[token] = 1
            else:
                terms[token] += 1
    # Add the normalized terms to the postings lists
    for term in terms:
        self.__addPosting(term,
                          Posting(docID, terms[term], static_grade,
                                  time))  # terms[term] holds the term frequency
Code Example #16
def indexDocument(url, content):
    indices = defaultdict(lambda: [])

    # Build the soup of the current html file
    soup = BeautifulSoup(content, "html.parser")

    # Tokenize the html and compute frequencies of the tokens
    words = tk.tokenize(soup.get_text())
    frequencies = tk.computeWordFrequencies(words)

    # Get the text that is bolded
    boldedText = getTextInTags(soup, "b")
    boldedText += getTextInTags(soup, "strong")

    # Get the text in the headers
    headerText = getTextInTags(soup, "header")
    for i in range(1, 7):
        headerText += " "
        headerText += getTextInTags(soup, f"h{i}")

    # Get the text in the title
    titleText = getTextInTags(soup, "title")

    # Loop through all token-frequencies
    for token, frequency in frequencies.items():
        score = frequency

        # Add to the score for its occurrences in important html tags
        score += boldedText.count(token) * 5
        score += headerText.count(token) * 10
        score += titleText.count(token) * 15

        # Add the posting to the current index
        indices[token].append(Posting(url, score))

    return indices
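The per-token score built above is the raw term frequency plus fixed bonuses for occurrences in important HTML tags: 5 per occurrence in bold text, 10 per occurrence in header text, and 15 per occurrence in the title. A tiny worked example with assumed counts (not taken from a real page):

frequency = 4    # the token appears 4 times in the page text (assumed)
bold_hits = 1    # one occurrence inside <b>/<strong> (assumed)
header_hits = 1  # one occurrence inside <header>/<h1>..<h6> (assumed)
title_hits = 1   # one occurrence in <title> (assumed)

score = frequency + bold_hits * 5 + header_hits * 10 + title_hits * 15
print(score)     # 34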
Code Example #17
def init_inverted_index():
    idx = 1
    doc_list = []
    inverted_index = InvertedIndex()

    while True:
        try:
            document = deserialize(str(idx)+".dbf")
            doc_list.append(document)
            idx += 1
        except IOError:
            break

    total = len(doc_list)
    inverted_index.n = total

    for document in doc_list:
        lower_doc = str(document).lower()
        tokens = nltk.word_tokenize(lower_doc)
        for pos in range(0,len(tokens)):
            tk = tokens[pos]

            if tk not in inverted_index:
                inverted_index[tk] = list()
            
            term_data = inverted_index[tk]
            if not document.id in map(lambda p: p.doc_id, term_data):
                term_data.append(Posting(document.id))

            for posting in term_data:
                if posting.doc_id == document.id:
                    posting.positions.append(pos)
                    break
        print "{0:.2f}% completed...".format(float(document.id)/total * 100)

    serialize(inverted_index,"inverted_index.idx")
Code Example #18
    dialog_cnpj_padrao.txt_cnpj_padrao.textChanged.connect(
        on_cnpj_padrao_alterado)
    dialog_cnpj_padrao.txt_cnpj_padrao.keyPressEvent = txt_cnpj_padrao_keyPressEvent

    ui.btn_cnpj.clicked.connect(mostra_dialogCnpj)

    dialog_cnpj_padrao.btn_ok.clicked.connect(confirma_cnpj_padrao)
    dialog_cnpj_padrao.btn_cancel.clicked.connect(Dialog_Cnpj.reject)

    ui.txt_num_notas.setValidator(
        QtGui.QIntValidator(constant.MIN_NOTAS, constant.MAX_NOTAS))
    ui.txt_num_notas.setText(str(constant.DEFAULT_NUMERO_NOTAS))

    ui.txt_cnpj_estab.setText(constant.EMPTY_STR)

    servico_posting = Posting()
    ui.btn_postar.clicked.connect(on_abre_postar)

    m = Messages()

    lista_notas = []
    lista_cnpj = []

    mes_tipo = 1  # current month
    mes_sel = define_mes_padrao()
    mes_sel_int = define_mes_padrao_int()

    if constant.INICIA_DB_INICIO:
        init_db()

    # build the lists (invoices and companies)
Code Example #19
        while posting is not None:
            if i < self.topK:
                docIDs.append(posting.docID)
            i = i + 1
            posting = posting.next
        return i, docIDs

    def normalization(self, tfsList):  # log and normalization
        length = 0
        for i in xrange(0, len(tfsList)):
            length = length + (1 + math.log10(tfsList[i])) * (
                1 + math.log10(tfsList[i]))
        length = math.sqrt(length)
        for i in xrange(0, len(tfsList)):
            tfsList[i] = (1 + math.log10(tfsList[i])) / length


if __name__ == "__main__":
    from term import Term
    from posting import Posting
    terms = [Term(100), Term(10), Term(1)]
    posting4 = Posting('doc4', [1, 1, 2], 100, 0.001, None)
    posting3 = Posting('doc3', [2, 1, 2], 200, 0.003, posting4)
    posting2 = Posting('doc2', [2, 2, 2], 110, 0.004, posting3)
    posting = Posting('doc1', [3, 3, 3], 900, 0.005, posting2)
    temp = Ranker()
    docid = temp.rankByRelevancy(terms, posting)
    for doc in docid:
        print doc
    #just for test
Code Example #20
def processDirectory(dirpath, filenames, curr_docid):

    # PorterStemmer for tokenizing
    p = PorterStemmer()

    # index will be the inverted index. It will be offloaded at several points in the program.
    index = defaultdict(list)

    # freq is a dictionary that stores the frequency (tf) of each term. Cleared every time a file is finished with parsing.
    freq = defaultdict(int)

    # docid map to be returned
    did = {}
    for f in filenames:
        file = open(dirpath + '\\' + f, 'r')
        l = file.readline()

        # Tries to load json; a ValueError here should never happen, but if it does,
        # report it and skip this file rather than continue with an undefined json_dict.
        try:
            json_dict = json.loads(l)
        except ValueError:
            print('Loading file ' + str(dirpath) + str(f) + ' has failed')
            file.close()
            continue

        soup = BeautifulSoup(json_dict['content'], features='lxml')

        # Updates doc_id map with new URL
        did[curr_docid] = tuple([json_dict['url'], len(soup.get_text())])

        # Suppress BeautifulSoup warnings about URLs in text.
        warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

        # Parsing section. Essentially checks to make sure that a stem is greater than 2 characters, but not completely composed of numbers.
        # Temporarily stores the frequency of each word as the tfidf. Updated at the end of indexing.
        for w in nltk.tokenize.word_tokenize(soup.get_text()):
            freq[p.stem(w)] += 1
        for i, j in freq.items():
            if (re.match("^[a-zA-Z0-9][a-zA-Z0-9]+$", i)
                    and (not re.match("^[0-9][0-9]+$", i))):
                index[i].append(Posting(curr_docid, float(j), 1, 0))

        # Special Weighing for bold words.
        for w in soup.find_all('b'):
            for t in nltk.tokenize.word_tokenize(w.text):
                if (index.get(t) != None):
                    index[t][-1].setpriority(2)

        # Special weighing for headers.
        for w in soup.find_all(re.compile('^h[1-6]$')):
            for t in nltk.tokenize.word_tokenize(w.text):
                if (index.get(t) != None):
                    index[t][-1].setpriority(3)

        # Special weighing for titles.
        for w in soup.find_all('title'):
            for t in nltk.tokenize.word_tokenize(w.text):
                if (index.get(t) != None):
                    index[t][-1].setpriority(4)

        curr_docid += 1
        file.close()
        freq.clear()
    dump(index, os.path.basename(dirpath))
    return did
Code Example #21
import re

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from posting import Posting

try:
    import cPickle as pickle
except ImportError:
    import pickle

stemmer = PorterStemmer()
stopwords = {stemmer.stem(i) for i in stopwords.words("english")}
print({stemmer.stem(w.lower()) for w in
                {word for sent in sent_tokenize("government's\nbusiness")  # (re.sub("[-']", " ", file.read()))
                 for word in word_tokenize(sent)}})
with open("dictionary.txt", mode="rb") as dictionary_file,\
     open("postings.txt", mode="rb") as postings_file:
    dictionary = pickle.load(dictionary_file)
    posting = Posting(dictionary, postings_file)
    counter_numbers = 0
    counter_number_disksize = 0
    counter_stopwords = 0
    counter_disksize = 0
    counter_possessive = 0
    counter_dash = 0
    for k in dictionary:
        if re.match(r'[\d.,/]+', k): # Numbers and dates
            counter_numbers += 1
            counter_number_disksize += dictionary[k].size
        elif re.match(r'.*\'.*', k): # Containing an apostrophe (possessive)
            # print(k, posting[k].list)
            counter_possessive += 1
        elif re.match(r'.*-.*', k): # Containing a dash
            # print(k, posting[k].list)
Code Example #22
def scrape():
    '''
    Driving function to web scrape

    Parameters:
        None

    Returns:
        None
    '''
    #Loads urls
    print("Running Auto Job Application Tracker")
    print("Attempting to open insert.txt")
    try:
        new_applications = open("Config/Insert.txt", "r")
        urls = new_applications.read().splitlines()
        new_applications.close()

        total = len(urls)
        if total == 0:
            print(
                "Insert.txt is empty, please make sure you save the file before running this script."
            )
            print("Exiting script...")
            sys.exit(1)
        else:
            print("Found {} Urls, proceeding to scrape data".format(total))

        for num, job_url in enumerate(urls, 1):
            print("--------------------------------")
            print("Parsing application {}/{}".format(num, total))
            try:
                #Open connection to job url
                uClient = uReq(job_url)

                #Html Parsing
                page_soup = soup(uClient.read(), "html.parser")

                uClient.close()
            except Exception:
                print("Incorrect url, please check: {}".format(job_url))
                empty = Posting(url=job_url)
                empty.set_empty()
                empty.append()
            else:
                #Creates appropriate posting object
                if "indeed" in job_url:
                    ad = Posting("INDEED", job_url, page_soup,
                                 constants.INDEED_TITLE_TAG,
                                 constants.INDEED_TITLE_CLASS,
                                 constants.INDEED_COMPANY_TAG,
                                 constants.INDEED_COMPANY_CLASS,
                                 constants.INDEED_LOC_TAG,
                                 constants.INDEED_LOC_CLASS)
                elif "linkedin" in job_url:
                    ad = Posting("LINKEDIN", job_url, page_soup,
                                 constants.LINKEDIN_TITLE_TAG,
                                 constants.LINKEDIN_TITLE_CLASS,
                                 constants.LINKEDIN_COMPANY_TAG,
                                 constants.LINKEDIN_COMPANY_CLASS,
                                 constants.LINKEDIN_LOC_TAG,
                                 constants.LINKEDIN_LOC_CLASS)
                elif "workopolis" in job_url:
                    ad = Posting("WORKOPOLIS", job_url, page_soup,
                                 constants.WORKOPOLIS_TITLE_TAG,
                                 constants.WORKOPOLIS_TITLE_CLASS,
                                 constants.WORKOPOLIS_COMPANY_TAG,
                                 constants.WORKOPOLIS_COMPANY_CLASS,
                                 constants.WORKOPOLIS_LOC_TAG,
                                 constants.WORKOPOLIS_LOC_CLASS)
                elif "glassdoor" in job_url:
                    ad = Posting("GLASSDOOR", job_url, page_soup,
                                 constants.GLASSDOOR_TITLE_TAG,
                                 constants.GLASSDOOR_TITLE_CLASS,
                                 constants.GLASSDOOR_COMPANY_TAG,
                                 constants.GLASSDOOR_COMPANY_CLASS,
                                 constants.GLASSDOOR_LOC_TAG,
                                 constants.GLASSDOOR_LOC_CLASS)
                else:
                    print("URL not supported")
                    continue

                ad.find_data()
                ad.find_skills()
                ad.append()

    except FileNotFoundError:
        print("File does not exist, creating that file now...")
        print("Please copy your URLs into 'insert.txt'")
        new_applications = open("Config/Insert.txt", "w")
        print("Exiting script...")
        sys.exit(1)