Example no. 1
 def test_missing_filename_python(self):
     """Make sure missing files raise the correct error"""
     filename = self.get_temp_filename()
     os.remove(filename)
     import textract
     from textract.exceptions import MissingFileError
     with self.assertRaises(MissingFileError):
         textract.process(filename)
Example no. 2
 def test_unsupported_extension_python(self):
     """Make sure unsupported extension raises the correct error"""
     filename = self.get_temp_filename(extension="extension")
     import textract
     from textract.exceptions import ExtensionNotSupported
     with self.assertRaises(ExtensionNotSupported):
         textract.process(filename)
     os.remove(filename)
def annotate_doc(pdf_file_path, ontologies):
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    db = DBConnect()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in annotation process',
            'exception': '',
            'data': ''
        }
        db.insert_log(log)
        return
    ontologies = ",".join(ontologies)
    annotations = []
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
                     display_links='true', display_context='false', minimum_match_length='3',
                     exclude_numbers='true', longest_only='true', ontologies=ontologies, exclude_synonyms='true')
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                context_begin = annotation['from']  if annotation['from'] - 40 < 1 else annotation['from'] - 40
                context_end = annotation['to'] if annotation['to'] + 40 > len(text) else annotation['to'] + 40
                record = {
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id': result['annotatedClass']['links']['ontology'],
                    'text': u'' + annotation['text'].encode('utf-8'),
                    'match_type': annotation['matchType'],
                    'context': u''+text[context_begin:context_end]
                }
                annotations.append(record)
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError) as e:
        print e
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Annotator',
            'exception': str(e),
            'data': ''
        }
        db.insert_log(log)
        return
Example no. 4
def pdftotext_any(myfile):
    # Todo: use tempfile instead
    path = '/tmp/infile.pdf'
    with open(path, 'wb') as f:
    #with tempfile.NamedTemporaryFile() as f:
    #    path = f.name
        f.write(myfile)
    text = textract.process(path, method='pdftotext')
    if len(text)<5: # No text found, it is probably an image scan, so we need to do an OCR
        text = textract.process(path, method='tesseract')
    return text
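
Note: the "Todo: use tempfile instead" comment in the example above points at the hard-coded /tmp/infile.pdf path. Below is a minimal sketch of that variant, assuming the same pdftotext-then-tesseract fallback; the function name pdftotext_any_tmp and the 5-character threshold are illustrative, not taken from the original.

import tempfile

import textract


def pdftotext_any_tmp(myfile):
    # Write the bytes to a named temporary file so textract can open it by path;
    # the file is removed automatically when the with-block exits.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as f:
        f.write(myfile)
        f.flush()
        text = textract.process(f.name, method='pdftotext')
        if len(text) < 5:  # little or no text: probably a scanned image, fall back to OCR
            text = textract.process(f.name, method='tesseract')
    return text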
Example no. 5
def build_indexes(files_list, index_file):
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['
    hash_index = {}
    for item in files_list:
        text = textract.process(item)
        details = re.split("[, \t\n\\t:;]", text)
        for i in details:
            if i == "":
                continue
            # dict.has_key() is Python 2 only; setdefault/get keeps the same counting behavior
            hash_index.setdefault(i, {})
            hash_index[i][item] = hash_index[i].get(item, 0) + 1


        # update the bar
        sys.stdout.write("-")
        sys.stdout.flush()

    sys.stdout.write("\n")
    fp = open(index_file, "w")
    json.dump(hash_index, fp)
    fp.close()
Example no. 6
	def get_text_from_file(self, file):
		filename = file['id'] + '.pdf'

		self._download_file(file, filename)
		text = textract.process(filename)
		os.remove(filename)
		return text
Example no. 7
    def extract_text_from_lectureDocuments(self):
        # pull files from database
        lectureDocumentsObjects = lectureDocuments.objects.filter(extracted=False)

        # loop through modules and pull all text
        for lectureDocumentsObject in lectureDocumentsObjects:
            if lectureDocumentsObject.document:
                print lectureDocumentsObject.document
                path_to_file = MEDIA_ROOT + '/' + str(lectureDocumentsObject.document)
                document_contents = textract.process(path_to_file, encoding='ascii')

                # create tags from noun_phrases
                # only add tags if none exist
                blobbed = TextBlob(document_contents)
                np = blobbed.noun_phrases
                np = list(set(np))
                np = [s for s in np if s]
                lectureDocumentsObject.tags.clear()
                for item in np:
                    s = ''.join(ch for ch in item if ch not in exclude)
                    print s
                    lectureDocumentsObject.tags.add(s)

                # save this string
                lectureDocumentsObject.document_contents = document_contents
                lectureDocumentsObject.extracted = True
                lectureDocumentsObject.save()
def indexing():
    ana = analysis.StemmingAnalyzer()
    schema = Schema(title=TEXT(analyzer=ana, spelling=True), path=ID(stored=True), content=TEXT)
    ix = create_in("data/pdf_data", schema)
    writer = ix.writer()
    count = 0

    with open('Final_Links/doc_links.txt') as fp, open('data/pdf_data/mytemp/doc_content.txt', 'w+') as f:
        for line in fp:
            count += 1
            url = line
            doc_name = re.search('.*/(.*)', url).group(1)

            try:
                response = urllib2.urlopen(url, timeout=3)
                if int(response.headers['content-length']) > 2475248:
                    continue
                fil = open("data/pdf_data/mytemp/" + doc_name, 'w+')
                fil.write(response.read())
                fil.close()

                content_text = textract.process('data/pdf_data/mytemp/' + doc_name, encoding='ascii')
                f.write(content_text)
                writer.add_document(title=unicode(url, "utf-8"), path=unicode(url, "utf-8"),
                                    content=unicode(content_text))
                writer.add_document(title=unicode(url, "utf-8"), path=unicode(url, "utf-8"),
                                    content=unicode(url))
            except Exception as e:
                print "Caught exception e at " + '' + str(e)
                continue
            print str(count) + " in " + " URL:" + url

    writer.commit()
    print "Indexing Completed !"
Example no. 9
 def save(self, *args, **kwargs):
     super(Document, self).save(*args, **kwargs)
     text = textract.process(self.source_file.url)
     filtered_stems = self.get_filtered_stems(text)
     self.total_word_count = len(filtered_stems)
     self.count_target_words(filtered_stems)
     super(Document, self).save(*args, **kwargs)
Example no. 10
def extract(path):
    '''
    Extract full text from PDFs

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.

    :return: [str] a string of text

    Usage::

        from pyminer import miner

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    '''
    text = textract.process(path)
    return text
Example no. 11
def process_text_file(file_path):
    file_name, extension = os.path.splitext(file_path)
    print file_name, extension
    if (extension == ".txt"):
        return file_path
    elif (extension == '.epub'):
        print "Trying epub"
        try:
            text = textract.process(file_path)
            print "Processed epub: ", file_path
            output_path = file_name+'.txt'
            output_file = open(output_path, 'w')
            output_file.write(text)
            print "Converted epub: ", output_path
            return output_path
        except Exception as error:
            # TODO: textract raises own error so none isn't returned on try failure
            print error
            print 'Failed to convert epub: ', file_path
            return None
    elif (extension == ""):
        text_content = None
        try:
            with open(file_path) as input_file:
                text_content = input_file.read()
                if text_content:
                    print "Managed to read file: ", file_path
                    return file_path
        except IOError:
            print "Failed to read file: ", file_path
            return None
    else:
        print 'Unsupported file type: ', file_path
        return None
Example no. 12
 def get_path_details(cls, temp_path, image_path):
     """Return the byte sequence and the full text for a given path."""
     byte_sequence = ByteSequence.from_path(temp_path)
     extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
     logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                   extension, byte_sequence.sha1)
     full_text = ""
     if extension is not None:
         try:
             logging.debug("Textract for SHA1 %s, extension map val %s",
                           byte_sequence.sha1, extension)
             full_text = process(temp_path, extension=extension, encoding='ascii',
                                 preserveLineBreaks=True)
         except ExtensionNotSupported as _:
             logging.exception("Textract extension not supported for ext %s", extension)
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except LookupError as _:
             logging.exception("Lookup error for encoding.")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except UnicodeDecodeError as _:
             logging.exception("UnicodeDecodeError, problem with file encoding")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except:
             logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
     return byte_sequence, full_text
def get_text_from_files(files_to_process):
    """Extracts text from each file given a list of file_names"""
    file_text_dict = {}
    for file_name in iter(files_to_process):
        extracted_text = textract.process(file_name)
        file_text_dict[file_name] = extracted_text
    return file_text_dict
Example no. 14
 def test_standardized_text_python(self):
     """Make sure standardized text matches from python"""
     import textract
     result = textract.process(self.standardized_text_filename)
     self.assertEqual(
         ''.join(result.split()),
         self.get_standardized_text(),
     )
def parse_sentences(pdf):
	text = textract.process(pdf)

	reg = "[.?!]"

	sentences = re.split(reg, text)

	return [s for s in sentences if "\\x" not in s]
Example no. 16
    def compare_python_output(self, filename, expected_filename=None, **kwargs):
        if expected_filename is None:
            expected_filename = self.get_expected_filename(filename, **kwargs)

        import textract
        result = textract.process(filename, **kwargs)
        with open(expected_filename) as stream:
            self.assertEqual(result, stream.read())
def detectar(f):
    texto = textract.process(f)
    texto = texto.decode('utf-8')
    _texto = textblob.TextBlob(texto)
    try:
        lang = _texto.detect_language()
        return lang
    except TranslatorError:
        return None
Example no. 18
 def test_standardized_text_python(self):
     """Make sure standardized text matches from python"""
     import textract
     result = textract.process(self.standardized_text_filename)
     self.assertEqual(
         six.b('').join(result.split()),
         self.get_standardized_text(),
         "standardized text fails for %s" % self.extension,
     )
def get_recommendations_file(pdf_file_path):
        if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
            text = textract.process(pdf_file_path, method="pdfminer")
        elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
            text = textract.process(pdf_file_path, method="beautifulsoup4")
        elif pdf_file_path.endswith('txt'):
            with open(pdf_file_path, 'r') as file:
                text = file.read()
        if text.isspace():
            log = {
                'file_name': pdf_file_path.encode('utf-8'),
                'error': 'Failed PDF to text transformation in recommendation process',
                'exception': '',
                'data': ''
            }
            db = DBConnect()
            db.insert_log(log)
            return []
        abstract_index = text.find('abstract')
        abstract_index += text.find('ABSTRACT')
        abstract_index += text.find('Abstract')
        abstract_index = 0 if abstract_index < 0 else abstract_index
        text = unidecode(text.decode('utf8'))
        text = ' '.join(text.split())
        text = text[abstract_index:abstract_index+500] if len(text) > 500 else text
        post_data = dict(apikey=settings.BIOPORTAL_API_KEY, input=text, include='ontologies',
                         display_links='false', output_type='2', display_context='false',
                         wc='0.15', ws='1.0', wa='1.0', wd='0.5')
        try:
            response = requests.post(settings.RECOMMENDER_URL, post_data)
            json_results = json.loads(response.text)
            best_ontology_set = json_results[0]['ontologies'] if len(json_results) > 0 else []
            return [{'acronym': ontology['acronym'], 'id': ontology['@id']} for ontology in best_ontology_set]
        except (ValueError, IndexError, KeyError) as e:
            log = {
                'file_name': '',
                'error': 'Bad response from Bioportal Recommender:',
                'exception': str(e),
                'data': ''
            }
            db = DBConnect()
            db.insert_log(log)
            return []
Example no. 20
def quickQuotes(fileName):
	fileText=""
	try:
		fileText=textract.process(fileName)
	except textract.exceptions.ExtensionNotSupported:
		pass
	except Exception as e:
		print e
		pass
	return fileText
Example no. 21
 def extract_all(self, src, maxpages=0):
     if '.pdf' in src:
         try:
             start = time()
             text = self.extract(src, maxpages=maxpages)
             print "case 1 elapsed_time {}s".format(time() - start)
         except Exception, e:
             start = time()
             text = textract.process(src)
             print "case 2 elapsed_time {}s".format(time() - start)
Example no. 22
    def compare_python_output(self, filename, expected_filename=None, **kwargs):
        if expected_filename is None:
            expected_filename = self.get_expected_filename(filename, **kwargs)

        import textract

        result = textract.process(filename, **kwargs)
        with open(expected_filename, "rb") as stream:
            result = self.clean_text(result)
            expected = self.clean_text(stream.read())
            assert result == expected
            self.assertEqual(result, expected)
def source():
    # Create a list 'text' to store the words of source.pdf
    # text = textract.process(input("Enter File path (Enter path in double quotes): "))
    text = textract.process("source.pdf")
    text = text.split()

    # Gets user's difficulty rating
    # difficulty = input("Please enter difficulty rating (0 being most explanatory, 4 being least): ")
    difficulty = 0
    difficulty = int(difficulty)

    return (text, difficulty)
Example no. 24
	def get_text(self,file):
		if (not re.search(r'\.pdf', file)):
			file = file + ".pdf"
		try:
			text = textract.process(file)
			text = text.strip()
			text = re.sub(b'\n+',b" ",text)
			text = re.sub(b'\s+',b" ",text)
			return text.decode("utf-8")
		except Exception as e:
			print("file: {} not found\ninformation from textract:\n\t{}".format(file,e))
			return 0
Example no. 25
def do_fetch_article(input_payload):
    logging.info("Fetching article from social post")
    result_payloads = []
    for link in input_payload["urls"]:
        url = link.get("expanded_url")
        display_url = link.get("display_url")
        shortened_url = link.get("url")

        file_name = download_file(url)

        text = textract.process(file_name)
        logging.info("Extracted article text ({} characters)".format(len(text)))

        metadata = {}
        try:
            metadata = extract_metadata(file_name)
        except:
            logging.exception("Failed to extract metadata from {}".format(url))

        delete_downloaded_file(file_name)
        logging.info("Deleted temp file: {}".format(file_name))

        result_payloads.append(
            {
                "contentType": "article-text",
                "key": url,
                "picture": get_favicon(url),
                "summary": {
                    "url": url,
                    "display_url": display_url,
                    "shortened_url": shortened_url,
                    "metadata": metadata
                },
                "raw": {
                    "text": text
                },
                "payload": {
                    "url": url,
                    "display_url": display_url,
                    "shortened_url": shortened_url,
                    "raw_text_size": len(text)
                },
                "placement_hints": {
                    "new_bldg": True,
                    "same_flr": False,
                    "flr_above": True,
                    "location_by_index": False,
                    "same_location": True,
                }
            }
        )
    return result_payloads
Example no. 26
 def crawl(self):
     self.content = textract.process(self.document)
     
     if self.type == "txt":
         try:
             self.data = self.content.lower().split("\r\n\r\n")
             self.data = filter(partial(ne,""),self.data)
         except:
             self.data = self.content.lower().split("\n\n")
             self.data = filter(partial(ne,""),self.data)
     else:
         self.data = self.content.lower().split("\n\n")
         self.data = filter(partial(ne,""),self.data)
Example no. 27
    def _convert_rtf_to_text(self,index):

        input_pdf = self.cvFile

        inputPath = os.getcwd()
        if os.path.exists(input_pdf):
            inputPath = os.path.dirname(input_pdf)
        input_filename = os.path.basename(input_pdf)
        input_parts = input_filename.split(".")
        input_parts.pop()

        text = textract.process(input_pdf)
        return text
Example no. 28
def convert(uploaded_file):
    ext_format = uploaded_file.split('.')[-1]

    file_dir = path_to_files + uploaded_file

    data = textract.process(file_dir)

    if ext_format == 'xls':
        data = convert_from_xml(data)


    with open(path_to_result+'result.txt', "w") as text_file:
        text_file.write(data)
Example no. 29
	def get_features(**kwargs):

		directory = kwargs['directory']

		for file_path in RTFReader.get_file_list(directory, 'rtf'):
			try:
				features = RTFReader.get_meta_features(file_path=file_path)
				features['text'] = textract.process(file_path).decode('utf8')
				features['_texta_id'] = file_path

				yield features

			except Exception as e:
				HandleDatasetImportException(kwargs, e, file_path=file_path)
Example no. 30
def word_count(pdf_file_path):
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
        with open(pdf_file_path, 'r') as file:
            text = file.read()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in recommendation process',
            'exception': '',
            'data': ''
        }
        db = DBConnect()
        db.insert_log(log)
        return []
    text = unicode(text, 'utf-8')
    words = word_tokenize(text.upper())
    c = Counter()
    c.update(words)
    return c
Example no. 31
def parsePDF_textract(path):
    text = textract.process(path)
Example no. 32
        date = (input('Input Date (mm/dd/yyyy)\n'))
    return date


# In[20]:

dic = {}
cont = 1
n = 0
while cont == 1:
    input('Choose pdf to import \n (press enter when ready)')
    Tk().withdraw(
    )  # we don't want a full GUI, so keep the root window from appearing
    file = askopenfilename(
    )  # show an "Open" dialog box and return the path to the selected file
    text = textract.process(file)
    text = str(text)
    text = text.replace(':', '')
    text = text.replace('\\n', ' \n ')
    print(text)
    title = input('What is the Title of the Activity: \n ')
    os.system('clear')
    print(text)
    credit = find_credits(text)
    print(credit)
    os.system('clear')
    print(text)
    date = find_date(text)
    print(date)
    dic.update({title: [date, credit]})
    n = n + 1
Example no. 33
 def matches(self, path: Path) -> Any:
     content = textract.process(str(path))
     return self.expr.search(content.decode("utf-8"))
Example no. 34
    def get_site(url_string):
        """
        :param url_string: A url to be accessed
        :return: site data for a given urlString. Performs all necessary low level socket-http stuff
        """
        print("get_site(" + str(url_string) + ")")
        url = urlparse(url_string)
        return_value = {
            "url": url_string,
        }

        try:
            response = requests.get(url_string)
            if "text/html" in response.headers['content-type']:
                add_to_queue = []
                soup = BeautifulSoup(response.content, "html.parser")
                return_value["content_type"] = "html"
                return_value["text"] = soup.body.text
                print("body.text = " + return_value["text"])
                for link in soup.findAll('a'):
                    try:
                        href = link.get('href')
                        current_scheme_prefix = url.scheme + "://"
                        parsed_href = urlparse(href)
                        if not parsed_href.netloc:
                            href = url.netloc + href
                            parsed_href = urlparse(href)
                        if not parsed_href.scheme:
                            href = current_scheme_prefix + href
                            parsed_href = urlparse(href)
                        if ".edu.sg" in parsed_href.netloc or "moe.gov.sg" in parsed_href.netloc:
                            add_to_queue.append(href)
                    except Exception as e:
                        print("HTML Parse failed: ", e, flush=True)
                        pass
                return_value["urlqueue"] = add_to_queue
            elif "application/pdf" in response.headers['content-type']:
                fileurl = "./tmp.pdf"
                with open(fileurl, "wb") as f:
                    f.write(response.content)
                with open(fileurl, "rb") as f:
                    pdf_reader = PyPDF2.PdfFileReader(f)
                    num_pages = pdf_reader.numPages
                    count = 0
                    text = ""
                    # The while loop will read each page
                    while count < num_pages:
                        page_object = pdf_reader.getPage(count)
                        count += 1
                        text += page_object.extractText()
                    # This if statement exists to check if the above library returned #words. It's done because
                    # PyPDF2 cannot read scanned files.
                    if text != "":
                        text = text
                    # If the above returns as False, we run the OCR library textract to #convert scanned/image based
                    # PDF files into text
                    else:
                        text = textract.process(fileurl,
                                                method='tesseract',
                                                language='eng')
                    # Now we have a text variable which contains all the text derived #from our PDF file. Type print(
                    # text) to see what it contains. It #likely contains a lot of spaces, possibly junk such as '\n'
                    # etc. Now, we will clean our text variable, and return it as a list of keywords.
                    all_web_or_relative_urls_regex = r'(?:(?:http|https):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,' \
                                                     r'.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,' \
                                                     r'.]*\)|[A-Z0-9+&@#\/%=~_|$])'
                    urls_on_pdf = re.findall(all_web_or_relative_urls_regex,
                                             text,
                                             re.IGNORECASE | re.MULTILINE)
                    return_value["text"] = text
                    return_value["urlqueue"] = urls_on_pdf
                return_value["content_type"] = "pdf"
        except Exception as e:
            print("get_site failed: ", e, flush=True)
            return None
        return return_value
Example no. 35
                                    other_index = match[i]
                            experience_resume = ''
                            while (skill_index < other_index):
                                experience_resume += text[skill_index]
                                skill_index += 1
                            rest_name = experience_resume
            return rest_name
    else:
        return ("no experience detected")


data = {}
cvs = []
for doc in docs:
    cv = {}
    text = textract.process(doc).decode('utf-8')
    nlp = spacy.load('en_core_web_lg')
    doct = nlp(text.replace('\n\n', '\n'))
    cv['file'] = doc
    print(doc)
    cv['name'] = get_name(text, nlp)
    cv['email'] = get_email(text)
    cv['phone'] = get_phone(text)
    cv['location'] = get_location(text, doct)
    cv['language'] = get_language(text, doct)
    cv['skills'] = get_skills(text)
    cv['experience'] = get_experience(text)
    print("==============")
    cvs.append(cv)
data['data'] = cvs
with open('data.json', 'w') as f:
    json.dump(data, f)
Example no. 36
found that \xe2\x80\x94
indicates empty cell

cmd line args: python pdfReader.py fileName rows cols
'''
args = sys.argv
PREFIX = args[5]
SPLIT = int(sys.argv[4])
ROW_NUM = int(args[2])
COL_NUM = int(args[3])
MAX_LENGTH = 16
###the right way to make a matrix using lists

matrix = [["-" for i in range(COL_NUM)] for j in range(ROW_NUM)]

text = textract.process(args[1])
text = str(text)
splits = text.split('\\r\\n')
column_count = 0
row_count = 0
for i in splits:
    if (row_count == ROW_NUM):
        break
    if (len(i) < MAX_LENGTH):
        if i == "\\xe2\\x80\\x94":

            matrix[row_count][column_count] = "-"
            column_count += 1
            if column_count >= COL_NUM:
                column_count = 0
                row_count += 1
Example no. 37
def getInfo2(strPath):
    import textract
    text = textract.process(strPath, method='pdfminer')
Example no. 38
def xls_pw(f):
    os.rename(f, f+'.xlsx')
    try:
        return process(f+'.xlsx').strip()
    except:
        return ''
Example no. 39
def doc_pw(f):
    os.rename(f, f+'.docx')
    try:
        return process(f+'.docx')
    except:
        return ''
Example no. 40
import sys
import json
from os import listdir
from os.path import isfile, join

import textract

if __name__ == "__main__":

    try:
        my_dir = sys.argv[1]
    except:
        my_dir = "raw_files/"

    print "Extracting text from:", my_dir

    file_list = [f for f in listdir(my_dir) if isfile(join(my_dir, f))]

    text = {}
    for f in file_list:
        print "\tProcessing file:", f
        text[f] = textract.process(join(my_dir, f), encoding="ascii")

    everything = {'input': text}

    with open("corpus_data.json", "w") as file_name:
        json.dump(everything, file_name)

    print "All Done for: ", text.keys()
Example no. 41
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

num_pages = pdfReader.numPages
count = 0
text = ""

while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()

if text != "":
    text = text

else:
    text = textract.process(fileurl, method='tesseract', language='eng')

tokens = word_tokenize(text)

punctuations = ['(', ')', ';', ':', '[', ']', ',']

stop_words = stopwords.words('english')
#Remove stop words and punctuation from the parse pdf
keywords = [
    word for word in tokens
    if not word in stop_words and not word in string.punctuation
]
#Make lowerCase from keywords
# lowerKeyword =[stripped.lower() for word in keywords]

print(keywords[:100])
# textract is installed on the server. Verified via pip install textract, which returned "Requirement already satisfied: pytz in /usr/lib/python2.7/site-packages (from tzlocal==1.5.1->extract-msg==0.23.1->textract) (2019.3)"

filepath = '/var/www/vhosts/tomedes.com/pro.tomedes.com/WordCount/REST/Input1/' + sys.argv[
    1]
filename = sys.argv[1]
filenamenoext = os.path.splitext(filename)[0]
# filepath = 'sample.pdf'

docfilename = '428f535ccf95770cdc0147ce7d2b01f0.doc'

# if docfilename[-4:] == ".doc":
#   filepath = './uploaded_files/428f535ccf95770cdc0147ce7d2b01f0.doc'
#   text = extract('./uploaded_files/428f535ccf95770cdc0147ce7d2b01f0.doc')
# else:
try:
    text = textract.process(filepath)
    # text = textract.process(str("sample.doc"))
    # text = textract.process("sample.doc").decode('utf-8')
    # text = textract.process("/var/www/vhosts/clone.tomedes.com/clone/wcnew/sample.doc")
    # print(text)
except:
    print('textract error')

# def extract(self, filepath):
#   print('extract')
#   stdout, stderr = self.run(['antiword', filepath])
#   return stdout

try:
    f = open(
        '/var/www/vhosts/tomedes.com/pro.tomedes.com/WordCount/REST/Input1/' +
Example no. 43
from tabula import read_pdf


# Only do once, then pickle for faster import
# cdcdf_full = pd.read_sas('/Users/alex/Documents/ML/cdc/data/LLCP2017.XPT',encoding='utf-8')

# Fix import rounding errors (e.g. 0.7e-80 instead of 0)
# cdcdf_full = cdcdf_full.round(5)

# pickle.dump(cdcdf_full, open('cdcdf_full.pickle', 'wb'))

# Import from saved pickle.dump
cdcdf_full = pickle.load( open( "cdcdf_full.pickle", "rb" ) )

# Get Variable Summary (varnames and question for each)
text = textract.process("information/codebook17_llcp-v2-508.pdf",
                       encoding='utf-8')
text = text.splitlines()
text = [i.decode('utf-8') for i in text]
v_and_q = pd.DataFrame({
    'Variable': [i.split(':')[1].strip() for i in text if 'SAS Variable Name' in i],
    'Question': [i.split(':')[1].strip() for i in text if 'Question:' in i]
})
v_and_q.drop_duplicates(inplace=True)
v_and_q.reset_index(drop=True,inplace=True)

# Make label dictionaries

# alltables = read_pdf('information/codebook17_llcp-v2-508.pdf',multiple_tables=True,
#                   pages='2-195')
# pickle.dump(alltables, open('alltables.pickle', 'wb'))
Example no. 44
import textract
import re
# Build a list of the words that appear in the glossary
glossx = textract.process("Glossario.pdf", method='pdftotext')
glossario = textract.process("Glossario.pdf",
                             method='pdftotext').decode("utf-8")
index = glossario.index("Android") - 10
glossario = glossario[index:]
glosswords = re.findall(r'\n[\w -]{1,32}[^.]\r\n', glossario)
glosswords += re.findall(r'\n[\w]{1,10}[.][\w]{1,10}\r\n', glossario)
glosswords += re.findall(r'\n\x0c[\w -]{1,32}[^.]\r\n', glossario)
glosswords = list(dict.fromkeys(glosswords))
gloss = []
for glossword in glosswords:
    gloss.append(glossword.lower().strip())
# all the glossary words are now in the gloss list

docs = [
    "AnalisiDeiRequisiti.pdf", "PianoDiProgetto.pdf", "Glossario.pdf",
    "NormeDiProgetto.pdf", "StudioDiFattibilità.pdf", "PianoDiQualifica.pdf"
]
verbali = [
    "VI_2020_10_27.pdf", "VI_2020_11_10.pdf", "VI_2020_11_26.pdf",
    "VI_2020_12_14.pdf", "VI_2020_12_20.pdf", "VI_2021_01_07.pdf",
    "VE_2020_12_17.pdf"
]
scelta = input(
    "Scegli tra \n1: analizza tutti i documenti\n2: analizza tutti i verbali\n3: analizza verbali e documenti\n4: analizza un solo file\nScelta:"
)
parole = []
if scelta == "1":
Example no. 45
def parseCV2():
    try:
        if request.method == 'POST':
            f = request.files['file']
            f.save(UPLOAD_FOLDER + f.filename)

            data = ResumeParser(UPLOAD_FOLDER +
                                f.filename).get_extracted_data()
            text = textract.process(UPLOAD_FOLDER + f.filename)
            text = text.decode("utf-8")

            data['birth_date'] = extractDOB(text)
            data['marital_status'] = marital(text)
            data['nationality'] = nation(text)
            data['gender'] = gen(text)

            text = text.encode("utf-8").decode("ascii",
                                               "ignore").replace("\n",
                                                                 " ").strip()

            exp, edu = getExpEdu(text)
            stopwords = set(nltk.corpus.stopwords.words('english'))
            stopwords.update(['resume', 'curriculum', 'vitae'])

            filters = ':'
            translate_dict = dict((c, " ") for c in filters)
            translate_map = str.maketrans(translate_dict)
            text = text.translate(translate_map)

            text = ' '.join([w for w in text.split() if w not in stopwords])

            ent = en.extract_entity_sections_grad(text)

            if 'certifications' in ent.keys():
                a = ent['certifications']
                cert = " ".join(a)
                data['certifications'] = cert
            else:
                data['certifications'] = None

            pro = ['project', 'projects', 'project profile']
            if len([p for p in pro if p in ent.keys()]) > 0:
                a = ent[[p for p in pro if p in ent.keys()][0]]
                proj = " ".join(a)
                data['projects'] = proj
            else:
                data['projects'] = None

            if 'hobbies' in ent.keys():
                a = ent['hobbies']
                hob = " ".join(a)
                data['hobbies'] = hob
            else:
                data['hobbies'] = None

            if 'summary' in ent.keys():
                a = ent['summary']
                res = " ".join(a)
                data['summary'] = res
            else:
                data['summary'] = None

            if 'objective' in ent.keys():
                b = ent['objective']
                rest = " ".join(b)
                data['objective'] = rest
            else:
                data['objective'] = None

            # data["experience"] = [{
            #             "organization":None,
            #             "profile": None,
            #             "currentIndicator":None,
            #             "duration": {"start":None, "end":None}
            #             }]

            links = extractor.find_urls(text)
            if (len(links) > 0):
                for link in links:
                    if "github" in link:
                        data['github'] = link
                    else:
                        data['github'] = None

                for link in links:
                    if "linkedin" in link:
                        data['linkedin'] = link
                    else:
                        data['linkedin'] = None

                for link in links:
                    if "skype" in link:
                        data['skype'] = link
                    else:
                        data['skype'] = None
            else:
                data['linkedin'] = None
                data['github'] = None
                data['skype'] = None

            url = ['linkedin', 'github', 'skype']

            for ur in url:
                for i in range(len(links)):
                    if ur in links[i]:
                        links[i] = links[i].replace(links[i], "")
            links = list(filter(None, links))
            links = list(set(links))
            data['webpage'] = links

            data["skills"] = [i.upper() for i in data["skills"]]
            if ((isinstance(data["education"], list))
                    and (len(data["education"]) > 0)):
                data["qualification"] = []
                for ed in data["education"]:
                    if isinstance(ed, tuple):
                        data["qualification"].append({
                            "educationDegree": ed[0],
                            "year": ed[1],
                            "university": None,
                            "currentIndicator": None
                        })
                    else:
                        data["qualification"].append({
                            "educationDegree": ed,
                            "year": None,
                            "university": None,
                            "currentIndicator": None
                        })
            else:
                data['qualification'] = None

            print(data['qualification'])
            if edu[0]['educationDegree'] == None:
                data["education"] = data["qualification"]
            else:
                data["education"] = edu
            data["experience"] = exp
            # data["education"] = edu
            data["success"] = True

            data['first_name'] = None
            data['middle_name'] = None
            data['last_name'] = None

            if len(data['name'].split()) > 0:
                data['first_name'] = data['name'].split()[0]
                if len(data['name'].split()) > 2:
                    data['middle_name'] = data['name'].split()[1]
                    data['last_name'] = data['name'].split()[-1]
                elif len(data['name'].split()) == 2:
                    data['last_name'] = data['name'].split()[-1]
                else:
                    pass

            output_data = {
                "status": True,
                "message": "Cv Parsed Successfully",
                "inputFile": f.filename,
                "data": {
                    "objective": data['objective'],
                    "summary": data['summary'],
                    "personalInfo": {
                        "fullName": data["name"],
                        "firstName": data["first_name"],
                        "middleName": data['middle_name'],
                        "lastName": data["last_name"],
                        "maritialStatus": data['marital_status'],
                        "dateOfBirth": data['birth_date'],
                        "nationality": data['nationality'],
                        "gender": data['gender'],
                        "language": None,
                        "address": None,
                        "hobbies": data['hobbies'],
                        "passportNumber": None
                    },
                    "contactInfo": {
                        "email": data["email"],
                        "telephone": data["mobile_number"],
                        "currentLocation": None,
                        "webpage": data['webpage'],
                    },
                    "socials": {
                        "githubURL": data['github'],
                        "linkedinURL": data['linkedin'],
                        "skype": data['skype']
                    },
                    "education": data["education"],
                    "experience": data["experience"],
                    "skills": data["skills"],
                    "projects": {
                        "name": data['projects'],
                        "detail": None
                    },
                    "certification": {
                        "subject": data['certifications'],
                        "provider": None,
                    },
                    "publications": {
                        "title": None,
                        "publisher": None,
                        "monthYear": None
                    },
                    "achievements": {
                        "name": None,
                        "detail": None
                    }
                }
            }

            # output_data = { "objective": data['objective'],
            #                 "summary": data['summary'],
            #                 "personal_info": {
            #                                     "name": data["name"],
            #                                     "email": data["email"],
            #                                     "mobileNumber": data["mobile_number"],
            #                                     "githubURL": data['github'],
            #                                     "linkedinURL": data['linkedin'],
            #                                     "firstName": data["first_name"],
            #                                     "middleName": data['middle_name'],
            #                                     "lastName": data["last_name"],
            #                                     "maritialStatus" : data['marital_status'],
            #                                     "dateOfBirth" : data['birth_date'],
            #                                     "nationality" : data['nationality'],
            #                                     "gender" : data['gender'],
            #                                     "hobbies": data['hobbies'],
            #                                     "projects": data['projects'],
            #                                     "certifications":data['certifications'],

            #                                 },
            #                 "education": data["qualification"],
            #                 "experience": data["experience"],
            #                 "skills": data["skills"],
            #                 "success": True
            #                 }
            #os.remove(UPLOAD_FOLDER+f.filename)
            return jsonify(output_data)
        else:
            return jsonify({"success": False}), 400
    except Exception as e:
        return jsonify({"success": False}), 400
Example no. 46
def add(request):

    if (len(request.GET['search']) == 0):
        all_name = []
        if 'abstract' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("abstract")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/abstract_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/abstract_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (abstract_start(k) or abstract_start(k[0:8])):
                                kk = 1
                            elif (abstract_end(k) or abstract_end(k[0:8])):
                                kk = 0
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] +
                            '_abstract.txt', 'w')
                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()
        if 'introduction' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("introduction")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/introduction_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/introduction_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (introduction_start(k)
                                    or introduction_start(k[0:8])):
                                kk = 1
                            elif (introduction_end(k)
                                  or introduction_end(k[0:8])):
                                kk = 0
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] +
                            '_introduction.txt', 'w')
                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()
        if 'method' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("method")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/method_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/method_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (method_start(k) or method_start(k[0:8])):
                                kk = 1
                            elif (method_end(k) or method_end(k[0:7])):
                                kk = 0
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] + '_method.txt',
                            'w')
                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()
        if 'result' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("result")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/result_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/result_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (result_start(k) or result_start(k[0:7])):
                                kk = 1
                            elif (result_end(k) or result_end(k[0:8])):
                                kk = 0
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] + '_result.txt',
                            'w')
                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()
        if 'discussion' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("discussion")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/discussion_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/discussion_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (discussion_start(k)
                                    or discussion_start(k[0:8])):
                                kk = 1
                            elif (discussion_end(k) or discussion_end(k[0:8])):
                                kk = 0
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] +
                            '_discussion.txt', 'w')
                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()
        if 'reference' in request.GET:

            DATA_DIR = '/root/Django/file'
            pdf_name = []
            all_name.append("reference")
            for filename in os.listdir(DATA_DIR):

                if (filename[-4:len(filename)] == ".pdf"):

                    pdf_name.append(filename.replace(".pdf", ""))

                    text = textract.process('/root/Django/file/' + filename,
                                            method='pdfminer')

                    file_txt = '/root/Django/file/reference_all_' + filename.replace(
                        ".pdf", ".txt")

                    xml = open(file_txt, 'w')
                    for i in xrange(len(text)):
                        xml.write(text[i])
                    xml.close()

                    for i in xrange(len(pdf_name)):
                        myfile_x = open('/root/Django/file/reference_all_' +
                                        pdf_name[i] + ".txt")
                        kk = 0
                        a1 = []
                        for j in myfile_x.readlines():
                            k = str(j.strip().replace(" ", ""))
                            if (reference_start(k) or reference_start(k[0:8])):
                                kk = 1
                            if kk == 1:
                                a1.append(str(j))
                        b1 = open(
                            '/root/Django/file/' + pdf_name[i] +
                            '_reference.txt', 'w')

                        for i in xrange(len(a1)):
                            b1.write(a1[i])
                        b1.close()

        DATA_DIR = '/root/Django/file'
        pdf_name_all = []

        for filename in os.listdir(DATA_DIR):

            if (filename[-4:len(filename)] == ".pdf"):

                pdf_name_all.append(filename.replace(".pdf", ""))

        b1 = open('/root/Django/calc/templates/home3.html')
        b2 = open('/root/Django/calc/templates/home4.html', 'w')

        for k in b1.readlines():
            kk = str(k.strip().replace(" ", ""))
            if (kk == "start"):
                if ('abstract' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_abstract.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
                if ('introduction' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_introduction.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
                if ('method' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_method.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
                if ('result' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_result.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
                if ('discussion' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_discussion.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
                if ('reference' in request.GET):
                    for j in xrange(len(pdf_name_all)):
                        b3 = open('/root/Django/file/' + pdf_name_all[j] +
                                  '_reference.txt')
                        for jj in b3.readlines():
                            b2.write(str(jj))
                            b2.write("<br>")
                    b2.write("<br>--------------------------------------<br>")
            else:
                b2.write(k)

        return render(request, 'home4.html')

    else:
        search = delect_special(request.GET['search']).split(" ")

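        # Free-text search branch: extract each PDF to all_<name>.txt with textract,
        # split it into sections, and keep only the sentences that contain a search term.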
        DATA_DIR = '/root/Django/file'
        pdf_name = []
        for filename in os.listdir(DATA_DIR):

            if (filename[-4:len(filename)] == ".pdf"):

                pdf_name.append(filename.replace(".pdf", ""))

                text = textract.process('/root/Django/file/' + filename,
                                        method='pdfminer')

                file_txt = '/root/Django/file/all_' + filename.replace(
                    ".pdf", ".txt")

                xml = open(file_txt, 'w')
                for i in xrange(len(text)):
                    xml.write(text[i])
                xml.close()

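        # For each PDF, scan its extracted text line by line; the *_start()/*_end()
        # helpers mark where the Abstract/Introduction/Method/Result/Discussion/
        # Reference sections begin and end, and matching lines are buffered per section.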
        a = []
        introduction = []
        method = []
        result = []
        discussion = []
        reference = []
        for i in xrange(len(pdf_name)):
            myfile_w = open('/root/Django/file/all_' + pdf_name[i] + '.txt',
                            'r')

            c = 0
            c1 = 0
            c2_1 = 0
            c2 = 0
            c3 = 0
            c4 = 0
            c3_1 = 0
            a1 = []
            introduction1 = []
            method1 = []
            result1 = []
            discussion1 = []
            reference1 = []
            for j in myfile_w.readlines():

                k = str(j.strip()).replace(" ", "")
                if (abstract_start(k) or abstract_start(k[0:8])):
                    c = 1
                elif (abstract_end(k) or abstract_end(k[0:8])):
                    c = 0
                if c == 1:
                    a1.append(j)

                if (introduction_start(k) or introduction_start(k[0:8])):
                    c1 += 1
                elif (introduction_end(k) or introduction_end(k[0:8])):
                    c1 = 0
                if c1 > 0:
                    introduction1.append(j)

                if (method_start(k) or method_start(k[0:8])):
                    c2 += 1
                elif (method_end(k) or method_end(k[0:7])):
                    c2 = 0
                if c2 > 0:
                    method1.append(j)

                if (result_start(k) or result_start(k[0:7])):
                    c3 += 1
                elif (result_end(k) or result_end(k[0:8])):
                    c3 = 0
                if c3 > 0:
                    result1.append(j)

                if (discussion_start(k) or discussion_start(k[0:8])):
                    c3_1 += 1
                elif (discussion_end(k) or discussion_end(k[0:8])):
                    c3_1 = 0
                if c3_1 > 0:
                    discussion1.append(j)

                if (reference_start(k) or reference_start(k[0:8])):
                    c4 += 1
                if c4 > 0:
                    reference1.append(j)

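            # Join each buffered section, split it into sentences on '.', and keep
            # only the sentences that contain at least one of the search terms.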
            a2 = ''.join(a1).split('.')
            a3 = []
            for x in xrange(len(a2)):
                a4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(a2[x]).upper():
                        a4 += 1
                if a4 > 0:
                    a3.append(a2[x])

            introduction2 = ''.join(introduction1).split('.')
            introduction3 = []
            for x in xrange(len(introduction2)):
                introduction4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(
                            introduction2[x]).upper():
                        introduction4 += 1
                if introduction4 > 0:
                    introduction3.append(introduction2[x])

            method2 = ''.join(method1).split('.')
            method3 = []
            for x in xrange(len(method2)):
                method4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(method2[x]).upper():
                        method4 += 1
                if method4 > 0:
                    method3.append(method2[x])

            result2 = ''.join(result1).split('.')
            result3 = []
            for x in xrange(len(result2)):
                result4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(result2[x]).upper():
                        result4 += 1
                if result4 > 0:
                    result3.append(result2[x])

            discussion2 = ''.join(discussion1).split('.')
            discussion3 = []
            for x in xrange(len(discussion2)):
                discussion4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(discussion2[x]).upper():
                        discussion4 += 1
                if discussion4 > 0:
                    discussion3.append(discussion2[x])

            reference2 = ''.join(reference1).split('.')
            reference3 = []
            for x in xrange(len(reference2)):
                reference4 = 0
                for xx in xrange(len(search)):
                    if str(search[xx]).upper() in str(reference2[x]).upper():
                        reference4 += 1
                if reference4 > 0:
                    reference3.append(reference2[x])

            b1 = open('/root/Django/calc/templates/home3.html', 'r')
            b2 = open('/root/Django/calc/templates/' + pdf_name[i] + '.txt',
                      'w')
            for b3 in b1.readlines():

                b4 = str(b3.strip().replace(" ", ""))
                if (b4 == "start" and len(a3) > 0):

                    b2.write("@@@")
                    for xxx in xrange(len(a3)):
                        b2.write(str(a3[xxx]) + "_")

                    b2.write("!!!")
                    for xxx in xrange(len(introduction3)):
                        b2.write(str(introduction3[xxx]) + "_")

                    b2.write("!!!")
                    for xxx in xrange(len(method3)):
                        b2.write(str(method3[xxx]) + "_")

                    b2.write("!!!")
                    for xxx in xrange(len(result3)):
                        b2.write(str(result3[xxx]) + "_")

                    b2.write("!!!")
                    for xxx in xrange(len(discussion3)):
                        b2.write(str(discussion3[xxx]) + "_")

                    b2.write("!!!")
                    for xxx in xrange(len(reference3)):
                        b2.write(str(reference3[xxx]) + "_")

                    b2.write("@@@")
                # elif(b4=="<!--introduction-->" and len(introduction3)>0):

                # for xxx in xrange(len(introduction3)):
                #  b2.write(str(introduction3[xxx])+"_")
                # elif(b4=="<!--method-->" and len(method3)>0):
                #   b2.write("method<br><br>")
                #   for xxx in xrange(len(method3)):
                #    b2.write(str(method3[xxx])+"-------------------<br>")
                # elif(b4=="<!--result-->" and len(result3)>0):
                #   b2.write("result<br><br>")
                #   for xxx in xrange(len(result3)):
                #    b2.write(str(result3[xxx])+"-------------------<br>")
                # elif(b4=="<!--discussion-->" and len(discussion3)>0):
                #   b2.write("discussion<br><br>")
                #   for xxx in xrange(len(discussion3)):
                #    b2.write(str(discussion3[xxx])+"-------------------<br>")
                # elif(b4=="<!--reference-->" and len(reference3)>0):
                #   b2.write("reference<br><br>")
                #   for xxx in xrange(len(reference3)):
                #    b2.write(str(reference3[xxx])+"-------------------<br>")
                #else:
                #  b2.write(b3)
        DATA_DIR = '/root/Django/calc/templates'
        b1 = open('/root/Django/calc/templates/home3.html', 'r')
        b2 = open('/root/Django/calc/templates/home4.html', 'w')
        for b3 in b1.readlines():
            b4 = str(b3.strip().replace(" ", ""))
            if (b4 == "start"):
                for filename in os.listdir(DATA_DIR):
                    arr = []
                    if (filename[-4:len(filename)] == ".txt"):

                        f1 = open('/root/Django/calc/templates/' + filename,
                                  'r')
                        for f2 in f1.readlines():
                            arr.append(f2)

                        ext = "".join(arr).replace("<", "").replace(
                            ">", "").replace("\'", "")
                        ext_sp = ext.split('@@@')
                        for f3 in range(1, len(ext_sp)):
                            ext_f3 = ext_sp[f3].split('!!!')

                            aar = [
                                'Abstract', 'Introduction', 'Method', 'Result',
                                'Discussion', 'Reference'
                            ]
                            for f4 in xrange(len(ext_f3)):
                                b2.write(str(aar[f4]) + "<br><br>")
                                f5 = ext_f3[f4].split('_')
                                for f6 in xrange(len(f5)):

                                    b2.write(str(f5[f6]) + "<br><br>")

                        b2.write("----------------------<br><br>")
            else:
                b2.write(b3)
        return render(request, 'home4.html')
Esempio n. 47
0
punctuations = ['(', ')', ';', ':', '[', ']', ',', 'The', 'the']  # despite the stop-word list, 'the' keeps popping up
def inspector(stg):
    if len(stg) < 3:  # filters out many other unwanted tokens
        return False
    if not stg.isalpha():  # filters out alphanumeric and numeric strings
        return False
    if stg in punctuations:
        return False
    if stg in stop_words:
        return False
    if stg == '.':  # filters out full stops (added measure)
        return False
    return True

filename = "JavaBasics-notes.pdf" #whatever file you want to scan
text = textract.process(filename, method='tesseract', encoding='ascii')
if text != "":
   text = text

#If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text

else:
   text = textract.process(filename, method='tesseract', language='eng', encoding='ascii')
   
#The word_tokenize() function will break our text phrases into individual words

newt = text.decode("ascii")
tokens = word_tokenize(newt)

#the keywords
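# Not part of the original snippet: a minimal sketch of how the inspector() filter
# above might be applied to the tokens. It assumes stop_words comes from NLTK's
# English stopword list, since the snippet does not show where it is defined.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
keywords = [token for token in tokens if inspector(token)]
print(keywords[:20])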
Esempio n. 48
0
    def parse_norm(self, response):
        meta_date = self.extract_with_css(
            response, 'span.meta-date::text').extract_first()
        today = date.today().strftime('%Y-%m-%d')

        # print(meta_date)
        def date_from_en_to_es(m):
            split = m.split()

            def translate(arg):
                arg = arg.lower()
                if arg == 'enero':
                    return 'jan'
                elif arg == 'febrero':
                    return 'feb'
                elif arg == 'marzo':
                    return 'mar'
                elif arg == 'abril':
                    return 'apr'
                elif arg == 'mayo':
                    return 'may'
                elif arg == 'junio':
                    return 'jun'
                elif arg == 'julio':
                    return 'jul'
                elif arg == 'agosto':
                    return 'aug'
                elif (arg == 'septiembre') | (arg == 'setiembre'):
                    return 'sep'
                elif arg == 'octubre':
                    return 'oct'
                elif arg == 'noviembre':
                    return 'nov'
                elif arg == 'diciembre':
                    return 'dec'
                else:
                    return 'None'

            split[0] = translate(split[0])
            date = ' '.join(split)
            return date

        meta_date = parser.parse(date_from_en_to_es(meta_date))
        meta_date = meta_date.strftime('%Y-%m-%d')
        # print(meta_date)

        if meta_date == today:
            # crawl new norm
            # print('Entered parse_norm')
            type = self.extract_with_css(
                response,
                'div.main-content h1.entry-title::text').extract_first()
            pdf_link = self.extract_with_css(response,
                                             'p.embed_download a::attr(href)')

            if len(pdf_link) == 1:
                # extract text from PDF
                # print('\nExtract text from PDF...')
                res_name = os.getenv(
                    'NORMATIVES_MUNICIPAL_PATH'
                ) + '/datasets/pdf/' + response.meta['link'].rsplit(
                    '/', 2)[-2] + '.pdf'
                # print('res_name', res_name)
                pdf_name = pdf_link.extract_first()
                pdf_name = iri_to_uri(pdf_name)
                # print('pdf_name', pdf_name)
                urllib.request.urlretrieve(pdf_name, res_name)
                text = textract.process(res_name).decode("utf-8")
                # print('Done!\n')

            else:
                # extract plain-text
                # print('\nExtract text from HTML...')
                html = self.extract_with_css(
                    response, 'div.main-content').extract_first()
                soup = BeautifulSoup(html, 'html.parser')
                text = soup.get_text()
                # print('Done!\n')

            yield Norm({
                'published_at': meta_date,
                'type': dict(full=type),
                'text': text,
                'link': response.meta['link'],
                'html': response.text
            })
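# Not part of the spider above: a sketch of the same month translation done with a
# lookup table instead of the long elif chain (note the original function actually
# maps Spanish month names to English abbreviations, despite its en_to_es name).
MONTHS_ES_TO_EN = {
    'enero': 'jan', 'febrero': 'feb', 'marzo': 'mar', 'abril': 'apr',
    'mayo': 'may', 'junio': 'jun', 'julio': 'jul', 'agosto': 'aug',
    'septiembre': 'sep', 'setiembre': 'sep', 'octubre': 'oct',
    'noviembre': 'nov', 'diciembre': 'dec',
}

def date_from_es_to_en(m):
    parts = m.split()
    parts[0] = MONTHS_ES_TO_EN.get(parts[0].lower(), 'None')
    return ' '.join(parts)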
Esempio n. 49
0
def get_definitions_dict(input_file):
    '''
    The input PDF file is scraped with the help of the textract library for image recognition.
    This is done by analyzing the content of each page when formatted as text. The definitions
    are capitalized and appear in the header of the text, so those lines become the keys of the
    definitions dict; the first sentence that mentions each definition becomes its value.
    Sentences are recognized by full stops, with common abbreviations that contain full stops
    excluded.

    Some helper logic does simple Natural Language Processing to include the prior or subsequent
    sentence, on top of the first sentence that mentions the definition, in the definition value.
    '''

    #Read the input file
    pdf_reader = PdfFileReader(open(input_file, "rb"))

    #The final definitions dictionary
    definitions_dict = {}

    for page_counter in range(
            pdf_reader.getNumPages()):  #Loop through each page
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf_reader.getPage(page_counter))
        with open('{0}.pdf'.format(page_counter), 'wb') as f:
            pdf_writer.write(f)
            f.close()
        text = textract.process(
            '{0}.pdf'.format(page_counter)
        )  #Use textract's image recognition to convert the pdf to text
        text = text.decode("utf8")

        lines = list(filter(lambda x: x != '',
                            text.split('\n')))  #Split text into lines

        passed_page_number = 0
        regex = re.compile(r'.\..')  # intended to split the text into sentences (not used below)

        for line in lines:
            if line.isupper():
                definitions_dict[line] = ''
        abbreviations_set = {
            'vs', 'etc', 'est', 'bc'
        }  #common abbreviations to ignore when deciding sentences

        sentences = list(filter(lambda x: x != '', text.split('.')))

        for sentence_index in range(
                len(sentences) - 1
        ):  #for each sentence, check if definition to the word is in the sentence. If so, update the definitions dictionary
            sentence = sentences[sentence_index]
            words = sentence.split(' ')
            if words[-1] in abbreviations_set or len(words) < 5:
                sentences[sentence_index +
                          1] = sentence + sentences[sentence_index + 1]
            else:
                for word in words:
                    word = re.sub(r'[^\w\s]', '', word)
                    if word.upper() in definitions_dict:
                        if definitions_dict[word.upper()] == '':
                            if ('Thus' in sentence or 'Because'
                                    in sentence) and sentence_index > 0:
                                sentence = sentences[sentence_index -
                                                     1] + sentence
                            definitions_dict[word.upper()] = sentence.replace(
                                '\n', ' ')

    csv_data = []  #save the definitions dict as a csv
    for value in definitions_dict:
        csv_data.append([value, definitions_dict[value]])
    with open("definitions.csv", "wt") as fp:
        writer = csv.writer(fp, delimiter=",")
        writer.writerows(csv_data)

    for file in os.listdir("."):
        if os.path.isfile(file) and file.endswith(".pdf"):
            os.remove(file)
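# Not part of the original: a minimal usage sketch for the helper above. The file
# name is hypothetical; the function writes its results to definitions.csv and then
# deletes every .pdf left in the working directory (including the input if it lives
# there), so run it from a scratch copy.
import csv

get_definitions_dict("glossary.pdf")
with open("definitions.csv") as fp:
    for term, definition in csv.reader(fp):
        print(term, "->", definition[:60])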
Esempio n. 50
0
    def get_docs(self):
        if bulk_collect_location_policy.is_allowed(self.path) is False:
            raise ValueError('Bulk collect path is illegal ' + self.path)

        source = self.sources[0]

        host = source['host']
        start_path = source['start_path']
        target_element = source['target_element']
        render_type = source['render_type']

        # find the 2nd nested tbody.

        folder_name = self.country.replace(' ', '-').lower()
        root_path = self.path + '/' + folder_name

        page_url = host + start_path
        results_response = requests.request('GET', page_url)
        results_html = results_response.content
        results_soup = BeautifulSoup(results_html, 'html.parser')

        tables = results_soup.find_all('table')
        results_table_index = 7
        results_table = tables[results_table_index]

        paragraphs = results_table.find_all('p')
        for i in range(0, len(paragraphs)-1, 2):
            p = paragraphs[i]
            p_next = paragraphs[i+1]

            if ("Press Release" in p.get_text()) == False:
                continue

            date_str = p.get_text().split(' - ')[0].strip()
            tmp = dateparser.parse(date_str)
            date = datetime.date(tmp.year, tmp.month, tmp.day)

            if gdpr_retention_specification.is_satisfied_by(date) is False:
                continue # try another result_link

            document_folder = p.get_text()
            document_folder_md5 = hashlib.md5(document_folder.encode()).hexdigest()

            language_code = 'en'

            document_link = links_from_soup_service(p_next)[0]
            document_url = document_link[1]
            document_response = requests.request('GET', document_url)
            document_content = document_response.content

            dirpath = root_path + '/' + document_folder_md5
            try:
                os.makedirs(dirpath)
            except FileExistsError:
                print('Directory path already exists, continue.')

            document_word_path = dirpath + '/' + language_code + '.doc'

            with open(document_word_path, 'wb') as f:
                f.write(document_content)

            document_text = textract.process(document_word_path)
            with open(dirpath + '/' + language_code + '.txt', 'wb') as f:
                f.write(document_text)

        return True
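# Not part of the original: textract.process() returns bytes, which is why the
# snippet above writes the .txt output in binary ('wb') mode. A standalone sketch
# of that single conversion step, with hypothetical file names:
import textract

text_bytes = textract.process('en.doc')   # bytes, not str
with open('en.txt', 'wb') as f:
    f.write(text_bytes)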
Esempio n. 51
0
def add_local_definitions(input_file, page_counter, intermediate_file_name,
                          definitions_dict):
    '''
    This function adds a definitions sidebar to one page. It does this by parsing the text of
    the page, finding terms that exist in the definitions dictionary provided as an argument,
    and writing a new page that combines the input page with the definitions sidebar.
    '''

    #Get the input page and dimensions of the input page
    input1 = PdfFileReader(open(input_file, 'rb')).getPage(page_counter)
    page_length = input1.mediaBox[3]
    page_width = input1.mediaBox[2]
    page_width = float(page_width)

    #Create the background canvas object that will hold both the page and the definitions sidebar.
    background_canvas = Canvas("background.pdf",
                               pagesize=(page_width * 1.5, page_length))
    background_canvas.setFont("Times-Roman", 12)
    background_canvas.setFillColor(white)
    background_canvas.drawString(1 * inch, 10 * inch, "White text")
    background_canvas.save()

    #Attach the original page to the left of the background canvas
    with open("background.pdf", "rb") as inFile, open(input_file,
                                                      "rb") as overlay:
        original = pypdf.PdfFileReader(inFile)
        background = original.getPage(0)
        foreground = pypdf.PdfFileReader(overlay).getPage(page_counter)
        background.mergePage(foreground)
        writer = pypdf.PdfFileWriter()
        for i in range(original.getNumPages()):
            page = original.getPage(i)
            writer.addPage(page)

        with open("modified1.pdf", "wb") as outFile:
            writer.write(outFile)

    #Convert page to text
    text = textract.process("modified1.pdf")
    text = text.decode("utf8")

    abbreviations_set = {'vs', 'etc', 'est', 'bc'}

    local_dict = {}

    text = ''.join(text.splitlines())

    #For each phrase of 1-3 words in the text, check if the same phrase is in the dictionary. If so, attach the definition and phrase to the local dict object
    sentences = list(filter(lambda x: x != '', text.split('.')))
    words_queue = collections.deque(3 * ['0'], 3)
    for sentence_index in range(len(sentences) - 1):
        sentence = sentences[sentence_index]
        words = sentence.split(' ')
        for word in words:
            if word != '':
                words_queue.append(word)

            for word_sample in [
                    words_queue[0].upper(),
                (words_queue[0] + ' ' + words_queue[1]).upper(),
                (words_queue[0] + ' ' + words_queue[1] + ' ' +
                 words_queue[2]).upper()
            ]:
                if word_sample in definitions_dict:
                    local_dict[word_sample] = definitions_dict[word_sample]

    #Create the definitions sidebar, and set relevant properties.
    insert_canvas = Canvas("insert.pdf",
                           pagesize=(page_width * 0.5, page_length))
    insert_canvas.setFillColor(HexColor("#D3D3D3"))
    insert_canvas.rect(5, 5, page_width * 0.5, page_length, fill=1)
    insert_canvas.setFillColor(black)
    insert_canvas.setFont('Times-Bold', 16)
    insert_canvas.drawString(page_width * 0.16, page_length - 0.25 * inch,
                             "DEFINITIONS")
    lines = 4

    #Write the dictionary of definitions used into the definitions sidebar
    for item in local_dict:
        insert_canvas.setFont("Times-Roman", 12)

        textobject = insert_canvas.beginText(
            10, page_length - (0.17 * inch * lines))
        my_text = f"{item} : {local_dict[item]}"
        my_text = textwrap.fill(my_text, 52) + '\n'
        for line in my_text.splitlines(False):
            textobject.textLine(line.rstrip())
        insert_canvas.drawText(textobject)
        lines += my_text.count('\n') + 2
    insert_canvas.save()

    #Combine the current page on the background canvas and the definitions sidebar
    with open("modified1.pdf", "rb") as inFile, open("insert.pdf",
                                                     "rb") as overlay:
        original = pypdf.PdfFileReader(inFile)
        background = original.getPage(0)
        foreground = pypdf.PdfFileReader(overlay).getPage(0)

        # merge the first two pages
        background.mergeTranslatedPage(foreground, 620, 0)

        # add all pages to a writer
        writer = pypdf.PdfFileWriter()
        for i in range(original.getNumPages()):
            page = original.getPage(i)
            writer.addPage(page)

        final_name = 'processed_' + intermediate_file_name
        # write everything in the writer to a file
        with open(final_name, "wb") as outFile:
            writer.write(outFile)

    #Delete all intermediate files
    os.remove('background.pdf')
    os.remove('modified1.pdf')
    os.remove('insert.pdf')
    os.remove(intermediate_file_name)
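# Not part of the original: a hedged driver sketch tying the two helpers together.
# The input file name is hypothetical and definitions_dict is assumed to have been
# built beforehand (e.g. from the definitions.csv produced by get_definitions_dict).
from PyPDF2 import PdfFileReader, PdfFileWriter

input_file = "contract.pdf"
definitions_dict = {"FORCE MAJEURE": "An event outside the parties' control ..."}

with open(input_file, "rb") as src:
    reader = PdfFileReader(src)
    for page_counter in range(reader.getNumPages()):
        intermediate = "{0}.pdf".format(page_counter)
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(page_counter))
        with open(intermediate, "wb") as f:
            writer.write(f)
        # writes 'processed_<page>.pdf' and removes the intermediate page file
        add_local_definitions(input_file, page_counter, intermediate, definitions_dict)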
Esempio n. 52
0
def text_for_epub(path):
    return textract.process(path).decode('utf-8')
Esempio n. 53
0
    # classify file type and remove metadata
    clean_fnames.pop(0)
    clean_fnames.pop(0)
    data = []
    pat1 = re.compile(r"% \S+")
    pat2 = re.compile(r"\n+")
    i = 0

    # For each sermon
    for fname in clean_fnames:
        print("file {}".format(i))
        i += 1
        filetype = fname.split(".")[-1]
        # Select correct filetype
        if filetype == "doc":
            text = textract.process(fname).decode('utf-8')
        elif filetype == "docx":
            text = docx2txt.process(fname)
        else:
            print(fname)  # TODO read odt
        # remove metadata
        text = pat1.sub("", text)
        text = pat2.sub("\n", text)
        #append to dataframe
        data.append([fname.split("/")[-1].split(".")[0], text])

    # save df, keeping only sermon_ID and text content
    df = pd.DataFrame(data)
    df.columns = ["id", "content"]
    outpath = os.path.join("data", "content.dat")
    df.to_csv(outpath, encoding='utf-8')
Esempio n. 54
0
def res(jobfile):
    Final_Array = []

    def lcs(X, Y):
        try:
            mat = []
            for i in range(0, len(X)):
                row = []
                for j in range(0, len(Y)):
                    if X[i] == Y[j]:
                        if i == 0 or j == 0:
                            row.append(1)
                        else:
                            val = 1 + int(mat[i - 1][j - 1])
                            row.append(val)
                    else:
                        row.append(0)
                mat.append(row)
            new_mat = []
            for r in mat:
                r.sort()
                r.reverse()
                new_mat.append(r)
            lcs = 0
            for r in new_mat:
                if lcs < r[0]:
                    lcs = r[0]
            return lcs
        except:
            return -9999

    def spellCorrect(string):
        words = string.split(" ")
        correctWords = []
        for i in words:
            correctWords.append(spell(i))
        return " ".join(correctWords)

    def semanticSearch(searchString, searchSentencesList):
        result = None
        searchString = spellCorrect(searchString)
        bestScore = 0
        for i in searchSentencesList:
            score = lcs(searchString, i)
            print(score, i[0:100])
            print("")
            temp = [score]
            Final_Array.extend(temp)
            if score > bestScore:
                bestScore = score
                result = i
        return result

    app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
    app.config['ALLOWED_EXTENSIONS'] = set(
        ['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

    def allowed_file(filename):
        return '.' in filename and \
            filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']

    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes_File_Names = []
    Resumes = []
    Temp_pdf = ''
    os.chdir('./Original_Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)

    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    # LIST_OF_FILES.remove("antiword.exe")
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)

    # print("Total Files to Parse\t" , len(LIST_OF_PDF_FILES))
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF", nooo)
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    # page = read_pdf.getPage(0)
                    # page_content = page.extractText()
                    # Resumes.extend(Temp_pdf)

                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages):

                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ')
                        # page_content.replace("\r", "")
                        Temp_pdf = Temp_pdf + str(page_content)
                        # Temp_pdf.append(page_content)
                        # print(Temp_pdf)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                    Resumes_File_Names.append(i)
                    # f = open(str(i)+str("+") , 'w')
                    # f.write(page_content)
                    # f.close()
            except Exception as e:
                print(e)
        if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
            print("This is DOC", i)

            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
                Resumes_File_Names.append(i)
            except Exception as e:
                print(e)

        if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
            print("This is DOCX", i)
            try:
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                b = str(a)
                c = [b]
                Resumes.extend(c)
                Resumes_File_Names.append(i)
            except Exception as e:
                print(e)
        # Resumes.extend(textract.process(i))
        if Temp[1] == "ex" or Temp[1] == "Exe" or Temp[1] == "EXE":
            # print("This is EXE" , i)
            pass

    # print("This is length of Resume Vector : " , len(Resumes))
    # # # print(Resumes[1][0:10])
    # for m , i in enumerate(Resumes):
    #     print("This is m : " , m , i[0][0:100])
    #     print("#######################################################################")

    for m, i in enumerate(Resumes):
        Resumes[m] = nltk.word_tokenize(Resumes[m])
        Resumes[m] = normalize(Resumes[m])
        Resumes[m] = ' '.join(map(str, Resumes[m]))

    jobfile = nltk.word_tokenize(jobfile)
    jobfile = normalize(jobfile)
    jobfile = ' '.join(map(str, jobfile))
    # Resumes2 = np.array(Resumes)

    # Resumes2 = Resumes2.ravel()

    # print(len(Resumes))

    # Resumes = ['microsoft is dumb' , 'google is awesome' , 'facebook is cheater']
    print("This is len Resumes : ", len(Resumes))
    os.chdir('../')

    print("#############################################################")
    # a = input("Enter String to Search : ")
    print("\n\n")
    print("Printing Scores of all Resumes...")
    print("\n")
    result = semanticSearch(jobfile, Resumes)
    print("\n")
    print("Printing 1 Best Result.....")
    print("\n")
    print(result)
    print("\n\n")
    print("#########################################################")
    print("#########################################################")
    print("#########################################################")
    print("#########################################################")
    print("\n\n")
    print(Final_Array)
    print("This is len Final_Array : ", len(Final_Array))
    print(Resumes_File_Names)
    print("This is len Ordered_list_Resume : ", len(Resumes_File_Names))
    Ordered_list_Resume = Ordered_list_Resume[1:]
    # print(Ordered_list_Resume)

    Z = [
        x
        for _, x in sorted(zip(Final_Array, Resumes_File_Names), reverse=True)
    ]
    flask_return = []
    # for n,i in enumerate(Z):
    #     print("Rankkkkk\t" , n+1, ":\t" , i)

    for n, i in enumerate(Z):
        # print("Rank\t" , n+1, ":\t" , i)
        # flask_return.append(str("Rank\t" , n+1, ":\t" , i))
        name = getfilepath(i)
        #name = name.split('.')[0]
        rank = n
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        # print(f"Rank{res.rank+1} :\t {res.filename}")
    return flask_return
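# Not part of the original: a quick illustration of the character-level lcs() scoring
# above, assuming the nested lcs() helper were available at module level. Longer shared
# runs of characters between the job description and a resume produce higher scores.
print(lcs("machine learning engineer", "senior machine learning lead"))  # 17: shared run "machine learning "
print(lcs("machine learning engineer", "accountant"))                    # 2: only "ac" is shared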
Esempio n. 55
0
def pdf_to_text_textract(pdf_file_path):
    page_text = textract.process(pdf_file_path)  #, encoding='ascii'
    return page_text
Esempio n. 56
0
def LoadDoc(path):
    words = textract.process(path)
    words = words.decode('utf-8')
    return words
Esempio n. 57
0
def readWord(path_to_Doc):
    document_text = textract.process(path_to_Doc)
    return document_text
Esempio n. 58
0
def read_pdf_as_text(path):
    return textract.process(path)
Esempio n. 59
0
def process_text(file, extension=None):
    if not extension:
        text = textract.process(file)
    else:
        text = textract.process(file, extension=extension)
    return text.decode()
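# Not part of the original: a usage sketch for the wrapper above, with hypothetical
# file names. The extension argument lets textract pick a parser when the file name
# itself has no useful extension; whether the leading dot is required can vary
# between textract versions.
body = process_text("report.pdf")                        # parser chosen from the file name
body = process_text("/tmp/upload_01", extension=".pdf")  # treat an extension-less file as PDF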
Esempio n. 60
0
import textract

#############################
# convert the files (5 articles)
#############################
text = textract.process("test.pdf", method='pdfminer')

search = "Predicting"  #?????


###################
# ignore useless characters
###################
def delect_special(a):  # strip punctuation that carries no meaning
    b = a.replace(".", "").replace("!", "").replace("@", "").replace(
        "#", "").replace("~", "").replace(",", "")
    return b


search1 = delect_special(search).split(" ")  # split the search string into individual terms
#print(search1)
#######################
#open and write the txt file
#######################

xml = open(
    'Cobelli1979_Identifiability_of_compartmental_systems_and_related_structural_properties.txt',
    'w')

for i in xrange(len(text)):
    xml.write(text[i])