Example #1
0
    def __init__(self, result, rank):
        """Build a document from a search result and split it into passages.

        Args:
            result: Search result providing ``title``, ``url`` and
                ``description`` attributes.
            rank: Rank stored on the document as ``self.rank``.
        """
        self.title = result.title
        self.url = utils.from_unicode_to_ascii(result.url)
        self.rank = rank
        self.description = utils.from_unicode_to_ascii(result.description)

        raw_content = self._get_content(result)
        self.content = utils.from_unicode_to_ascii(raw_content)

        # Choose the segmentation strategy named in the configuration;
        # anything other than "lines" or "sentences" means paragraphs.
        try:
            algorithm = MyConfig.get("document_segmentation", "algorithm")
            if algorithm == "lines":
                splitter = SplitIntoLinesAlgorithm
            elif algorithm == "sentences":
                splitter = SplitIntoSentencesAlgorithm
            else:
                splitter = SplitIntoParagraphsAlgorithm
            self.passages = splitter.split_into_passages(self)
        except MyConfigException as e:
            # Configuration problem: warn and fall back to paragraphs
            logging.getLogger("qa_logger").warning(str(e))
            self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(
                self)
Example #2
0
    def __init__(self, result, rank):
        """Create a document from a search result and segment its content.

        Args:
            result: Search result providing ``title``, ``url`` and
                ``description`` attributes.
            rank: Rank stored on the document as ``self.rank``.
        """
        to_ascii = utils.from_unicode_to_ascii

        self.title = result.title
        self.url = to_ascii(result.url)
        self.rank = rank
        self.description = to_ascii(result.description)

        self.content = to_ascii(self._get_content(result))

        # Splitters selectable by name; every other name means paragraphs
        named_splitters = {
            "lines": SplitIntoLinesAlgorithm,
            "sentences": SplitIntoSentencesAlgorithm,
        }

        try:
            configured = MyConfig.get("document_segmentation", "algorithm")
            chosen = named_splitters.get(configured, SplitIntoParagraphsAlgorithm)
            self.passages = chosen.split_into_passages(self)
        except MyConfigException as e:
            # Configuration problem: warn and fall back to paragraphs
            logging.getLogger("qa_logger").warning(str(e))
            self.passages = SplitIntoParagraphsAlgorithm.split_into_passages(self)
Example #3
0
    def _get_content(self, result):
        """Download the document behind a search result and extract its text.

        Args:
            result: Search result whose ``url`` attribute is fetched.

        Returns:
            Whatever ``self._extract_text`` produces for the downloaded
            content and its mimetype, or "" when the download fails.
        """
        logger = logging.getLogger("qa_logger")
        url = URL(result.url)

        try:
            timeout = int(MyConfig.get("document_retrieval", "timeout"))
        except (MyConfigException, TypeError, ValueError) as e:
            # A missing option and a non-integer value are handled alike:
            # warn and use the default instead of aborting the retrieval.
            logger.warning(str(e))
            timeout = 15

        try:
            mimetype = url.mimetype
            content = utils.from_unicode_to_ascii(url.download(timeout=timeout, unicode=True))
        except Exception as e:
            # If we cannot retrieve the document, we skip it
            logger.warning("%s couldn't be retrieved", result.url)
            logger.warning(str(e))
            return ""

        return self._extract_text(content, mimetype)
Example #4
0
    def parse_questions(self, path):
        """Read a questions file and return its entries as Question objects.

        Each usable line holds an identifier, optional spaces/tabs, and the
        question text. Blank lines are skipped silently, and lines that do
        not match that layout are skipped with a warning instead of
        aborting with an AttributeError.

        Args:
            path: Location of the questions file; it is decoded as UTF-8
                with undecodable bytes ignored.

        Returns:
            A list of Question objects in file order.

        Raises:
            SystemExit: If the file cannot be opened.
        """
        try:
            q_file = codecs.open(path, "r", encoding="utf-8", errors="ignore")
        except IOError:
            sys.exit("QA Error: bad argument")

        logger = logging.getLogger("qa_logger")
        # We use a regular expression for matching questions
        line_pattern = re.compile(r"(?P<id>[^ \t]+)[ \t]*(?P<question>.+)")

        questions = []
        try:
            for line in q_file:
                text = utils.from_unicode_to_ascii(line)
                m = line_pattern.match(text)
                if m is None:
                    # re.match returns None for blank or malformed lines
                    if text.strip():
                        logger.warning("Skipping malformed question line: %r", text)
                    continue
                questions.append(Question(m.group("id"), m.group("question")))
        finally:
            # Always release the handle, even if parsing a line failed
            try:
                q_file.close()
            except IOError:
                logger.warning("Questions file not closed")

        return questions
Example #5
0
    def _pdf_to_plaintext(self, content):
        """Convert raw PDF data to plain text using pdfminer.

        The data is written to a scratch file, run through ``process_pdf``
        with a ``TextConverter``, and the extracted text is passed through
        ``utils.from_unicode_to_ascii``. The scratch file and all handles
        are released on every path, including failures.

        Args:
            content: Raw PDF data to convert.

        Returns:
            The extracted text, or "" if processing the PDF fails.
        """
        pdf_file = "tmp.pdf"
        logger = logging.getLogger("qa_logger")

        # PDF data is binary: write it in binary mode so that no newline
        # translation can alter it.
        with open(pdf_file, "wb") as f:
            f.write(content)

        retrieval = StringIO()
        device = None
        try:
            resource_manager = PDFResourceManager()
            layout_params = LAParams()
            encoding = "utf-8"
            device = TextConverter(resource_manager,
                                   retrieval,
                                   codec=encoding,
                                   laparams=layout_params)

            with open(pdf_file, "rb") as f:
                try:
                    process_pdf(resource_manager, device, f)
                except PDFTextExtractionNotAllowed:
                    logger.warning("pdf file couldn't be retrieved (no permissions?)")
                    return ""
                except struct.error as e:
                    logger.warning("pdfminer internal error " + str(e))
                    return ""
                except Exception as e:
                    logger.warning(str(e))
                    return ""

            text = retrieval.getvalue()
        finally:
            # Runs on success and on every early return above, so neither
            # the handles nor the scratch file are left behind.
            if device is not None:
                device.close()
            retrieval.close()
            os.remove(pdf_file)

        return utils.from_unicode_to_ascii(text)
Example #6
0
    def _get_content(self, result):
        """Fetch the document a search result points to and extract its text.

        Args:
            result: Search result whose ``url`` attribute is downloaded.

        Returns:
            The value of ``self._extract_text`` for the downloaded content
            and its mimetype, or "" when the document cannot be retrieved.
        """
        logger = logging.getLogger("qa_logger")
        url = URL(result.url)

        try:
            timeout = int(MyConfig.get("document_retrieval", "timeout"))
        except (MyConfigException, TypeError, ValueError) as e:
            # Treat a non-integer timeout like a missing one: warn and
            # fall back to the default rather than aborting the retrieval.
            logger.warning(str(e))
            timeout = 15

        try:
            mimetype = url.mimetype
            content = utils.from_unicode_to_ascii(
                url.download(timeout=timeout, unicode=True))
        except Exception as e:
            # If we cannot retrieve the document, we skip it
            logger.warning("%s couldn't be retrieved", result.url)
            logger.warning(str(e))
            return ""

        return self._extract_text(content, mimetype)
Example #7
0
File: QA.py Project: nrvnujd/qa
    def parse_questions(self, path):
        """Parse a questions file into a list of Question objects.

        A usable line consists of an identifier, optional spaces/tabs, and
        the question text. Blank lines are ignored, and lines that do not
        fit that layout are skipped with a warning rather than raising an
        AttributeError.

        Args:
            path: Location of the questions file; it is decoded as UTF-8
                with undecodable bytes ignored.

        Returns:
            A list of Question objects in file order.

        Raises:
            SystemExit: If the file cannot be opened.
        """
        try:
            q_file = codecs.open(path, "r", encoding="utf-8", errors="ignore")
        except IOError:
            sys.exit("QA Error: bad argument")

        logger = logging.getLogger("qa_logger")
        # We use a regular expression for matching questions
        question_re = re.compile(r"(?P<id>[^ \t]+)[ \t]*(?P<question>.+)")

        questions = []
        try:
            for line in q_file:
                ascii_line = utils.from_unicode_to_ascii(line)
                m = question_re.match(ascii_line)
                if m is None:
                    # No match means a blank or malformed line
                    if ascii_line.strip():
                        logger.warning("Skipping malformed question line: %r",
                                       ascii_line)
                    continue
                id_q = m.group("id")
                q = m.group("question")
                questions.append(Question(id_q, q))
        finally:
            # Close the file even when a line could not be processed
            try:
                q_file.close()
            except IOError:
                logger.warning("Questions file not closed")

        return questions
Example #8
0
    def _pdf_to_plaintext(self, content):
        """Extract plain text from raw PDF data using pdfminer.

        The data is stored in a scratch file, processed by ``process_pdf``
        through a ``TextConverter``, and the resulting text is passed
        through ``utils.from_unicode_to_ascii``. Handles and the scratch
        file are cleaned up whether or not processing succeeds.

        Args:
            content: Raw PDF data to convert.

        Returns:
            The extracted text, or "" if processing the PDF fails.
        """
        pdf_file = "tmp.pdf"
        logger = logging.getLogger("qa_logger")

        # Binary mode: PDF data must reach the disk unmodified
        with open(pdf_file, "wb") as f:
            f.write(content)

        retrieval = StringIO()
        device = None
        try:
            resource_manager = PDFResourceManager()
            layout_params = LAParams()
            encoding = "utf-8"
            device = TextConverter(resource_manager, retrieval, codec=encoding, laparams=layout_params)

            with open(pdf_file, "rb") as f:
                try:
                    process_pdf(resource_manager, device, f)
                except PDFTextExtractionNotAllowed:
                    logger.warning("pdf file couldn't be retrieved (no permissions?)")
                    return ""
                except struct.error as e:
                    logger.warning("pdfminer internal error " + str(e))
                    return ""
                except Exception as e:
                    logger.warning(str(e))
                    return ""

            text = retrieval.getvalue()
        finally:
            # Executed for the normal path and for each early return, so
            # nothing stays open and tmp.pdf does not linger on disk.
            if device is not None:
                device.close()
            retrieval.close()
            os.remove(pdf_file)

        return utils.from_unicode_to_ascii(text)