Exemple #1
0
 def _get_doc_info(self, doc_id):
     """Ask the document parser for the details of one document.

     Sends a 'get_doc' request carrying ``doc_id`` to
     ``self.document_parser``.

     Returns a ``(title, text, author, date)`` tuple when the response
     status is 0. On any other status the response's error message is
     logged and None is returned.
     """
     request = msg.build_request(method='get_doc', data={'doc_id': doc_id})
     reply = self.document_parser.ask(request)

     # Guard clause: report the failure and bail out with None.
     if reply['status'] != 0:
         log.log_error("Could not get info for doc: {:}".format(
             reply['error_msg']))
         return None

     doc = reply['data']
     return doc['title'], doc['text'], doc['author'], doc['date']
Exemple #2
0
 def _stem_word(self, word):
     """Ask the document parser for the stem of ``word``.

     Sends a 'stem_word' request to ``self.document_parser``.

     Returns the 'stem' entry of the response data when the response
     status is 0. On any other status the response's error message is
     logged and an empty string is returned.
     """
     request = msg.build_request(method='stem_word', data={'word': word})
     reply = self.document_parser.ask(request)

     # Guard clause: a failed request yields the empty-string fallback.
     if reply['status'] != 0:
         log.log_error("Orchestrator could not stem word: {:}".format(
             reply['error_msg']))
         return ""

     return reply['data']['stem']
Exemple #3
0
 def _load_file(self, _file):
     """Parse an XML file and hand each of its pages to the index handler.

     Every ``page`` element directly under the document root is converted
     with ``self._parse_xml_page`` and sent to ``self.index_handler`` in a
     'store_page' request. Once all pages have been sent, ``_file`` is
     appended to ``self.parsed_docs``.

     Errors are logged (with the traceback at debug level) rather than
     raised. If a failure happens part-way through, pages already sent
     stay sent, but ``_file`` is not recorded in ``self.parsed_docs``.
     """
     try:
         log.log_info("DocumentParser.load_file parsing xml...")
         root = ET.parse(_file).getroot()
         log.log_info("Loading pages...")
         for page in root.findall('page'):
             # The id is only needed here for the progress log line.
             page_id = int(page.find('id').text)
             log.log_info("Loading page: {:}".format(page_id))
             page_data = self._parse_xml_page(page)
             self.index_handler.ask(
                 msg.build_request(method='store_page',
                                   data={'page': page_data}))
         log.log_info("Done loading pages")
         self.parsed_docs.append(_file)
     except Exception:
         # Catch Exception rather than using a bare except so that
         # SystemExit and KeyboardInterrupt are not swallowed here.
         log.log_error("DocumentParser.load_file error parsing xml")
         log.log_debug(traceback.format_exc())
    def on_start(self):
        """Start the IndexHandler, DocumentParser and QueryProcessor.

        The components are started in dependency order: the DocumentParser
        is given the index handler, and the QueryProcessor is given both
        the document parser and the index handler. The resulting handles
        are stored on ``self.index_handler``, ``self.document_parser`` and
        ``self.query_processor``.

        A failure to start a component is logged (traceback at debug
        level) and does not abort start-up; the remaining components are
        still attempted.
        """
        log.log_info("Starting Orchestrator...")

        try:
            log.log_info("Starting IndexHandler...")
            self.index_handler = IndexHandler.start()
            log.log_info("IndexHandler started")
        except Exception:
            # Exception, not a bare except, so SystemExit and
            # KeyboardInterrupt are never swallowed.
            log.log_error("Could not start IndexHandler")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting DocumentParser...")
            self.document_parser = DocumentParser.start(self.index_handler)
            log.log_info("DocumentParser started")
        except Exception:
            log.log_error("Could not start DocumentParser")
            log.log_debug(traceback.format_exc())

        try:
            log.log_info("Starting QueryProcessor...")
            self.query_processor = QueryProcessor.start(
                self.document_parser, self.index_handler)
            log.log_info("QueryProcessor started")
        except Exception:
            log.log_error("Could not start QueryProcessor")
            log.log_debug(traceback.format_exc())

        log.log_info("Orchestrator started")
Exemple #5
0
 def _get_doc_info(self, doc_id):
     """Find the page with id ``doc_id`` in the parsed files.

     Each file in ``self.parsed_docs`` is re-parsed and its ``page``
     elements are scanned until one whose ``id`` equals ``doc_id`` is
     found.

     Returns a dict with the keys 'title', 'text', 'author' and 'date'.
     The text is reduced to ASCII (other characters are dropped) and every
     character listed in ``self.unwanted_chars`` is replaced by a space.
     The author is the contributor's username if present, otherwise the
     contributor's ip, otherwise the placeholder "Author".

     If no file contains the page, an error is logged and a dict of
     placeholder values is returned.
     """
     for f in self.parsed_docs:
         log.log_info("Getting info from doc: {:}".format(f))
         root = ET.parse(f).getroot()
         for page in root.findall('page'):
             if int(page.find('id').text) != doc_id:
                 continue

             log.log_info("Found target page")
             title = page.find('title').text

             # An element with no content has .text == None; treat that
             # as empty text instead of failing on None.encode().
             raw_text = page.find('revision/text').text or ""
             text = raw_text.encode('ascii', 'ignore')
             for c in text:
                 if c in self.unwanted_chars:
                     text = text.replace(c, ' ')

             username = page.find('revision/contributor/username')
             ip = page.find('revision/contributor/ip')
             if username is not None:
                 author = username.text
             elif ip is not None:
                 author = ip.text
             else:
                 author = "Author"

             date = page.find('revision/timestamp').text
             return {
                 'title': title,
                 'text': text,
                 'author': author,
                 'date': date
             }

     # Reached only after every parsed file has been searched. This
     # fallback used to sit inside the file loop, so the search gave up
     # after the first file and returned None when no file was parsed.
     log.log_error(
         "Cound not find document in corpus: {:}".format(doc_id))
     return {
         'title': "Title",
         'text': "Text",
         'author': "Author",
         'date': "Date"
     }
Exemple #6
0
def main(argv):
    """Run the interactive search prompt.

    Recognised argument:
        -d FILE   ask the orchestrator to load FILE before prompting.

    After the optional load, the user is prompted with "Search: " in a
    loop. Each entry is sent to the orchestrator as a 'search' request and
    the response data is printed; a failed request has its error message
    logged instead. Entering "q" or "quit" ends the loop.

    The orchestrator is always stopped on the way out, including when the
    prompt loop is interrupted by an exception.
    """
    log.flush_log()
    log.log_info("Hello")

    # process arguments
    document = None
    if '-d' in argv:
        flag_pos = argv.index('-d')
        if flag_pos + 1 < len(argv):
            document = argv[flag_pos + 1]
        else:
            # '-d' was the last argument: report it instead of raising
            # IndexError.
            log.log_error("Option -d requires a file argument")

    # launch orchestrator
    orchestrator = Orchestrator.start()
    try:
        if document:
            response = orchestrator.ask(
                msg.build_request(method='load_file',
                                  data={'file': document}))
            if response['status'] != 0:
                log.log_error(response['error_msg'])
            else:
                log.log_info("Loaded file")

        while True:
            query = raw_input("Search: ")
            log.log_info("Query: {:}".format(query))
            if query in ("q", "quit"):
                break

            response = orchestrator.ask(
                msg.build_request(method='search', data={'query': query}))
            if response['status'] == 0:
                # Parenthesised single argument prints the same value
                # with or without print_function.
                print(response['data'])
            else:
                log.log_error(response['error_msg'])
    finally:
        # Stop the orchestrator even if the loop above raised (e.g. EOF
        # or Ctrl-C at the prompt), so it is never left running.
        orchestrator.stop()

    log.log_info("Goodbye")