def parsing_jd(jd_file_name):
    """Load, summarize and normalize a single job-description text file.

    The file is read from ``file_loc + jd_file_name + '.txt'``. Every path
    that ``glob`` matches for that pattern is also recorded in the
    module-level ``job_desc_files`` list, skipping paths already present.

    Args:
        jd_file_name: Name of the job-description file without the ``.txt``
            extension; it is appended to the module-level ``file_loc`` prefix.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Path``,
        ``File Name`` and ``Text``. ``Text`` holds the description after
        ``summarize(..., word_count=200)`` and ``text_process.normalize``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    path = file_loc + jd_file_name + '.txt'

    # Register the matched file(s) once in the shared list.
    for match in glob.glob(path, recursive=True):
        if match not in job_desc_files:
            job_desc_files.append(match)

    # The context manager closes the handle; no explicit close() is needed.
    with open(path, 'rt') as handle:
        jd = handle.read()

    jd = summarize(jd, word_count=200)
    jd = text_process.normalize(jd)

    df = pd.DataFrame(columns=['Path', 'File Name', 'Text'])
    df.loc[0] = [path, jd_file_name, jd]
    return df
# Example #2
# 0
def _pdf_to_text(pdf_path):
    """Return the text of all pages of *pdf_path* with newlines replaced by spaces."""
    page_texts = []
    with open(pdf_path, 'rb') as fh:
        # Iterate over all pages of the PDF document.
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            # In-memory text sink the converter writes the page text into.
            fake_file_handle = StringIO()
            converter = TextConverter(
                resource_manager,
                fake_file_handle,
                codec='utf-8',
                laparams=LAParams(),
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                page_texts.append(fake_file_handle.getvalue())
            finally:
                # Release the handles even when a page fails to parse.
                converter.close()
                fake_file_handle.close()
    # Join once and flatten newlines once instead of re-scanning per page.
    return ''.join(page_texts).replace('\n', ' ')


def extract_text_from_pdf(files_list):
    """Extract and normalize the text of each PDF and tabulate the results.

    Each PDF is converted to text page by page, newlines are replaced by
    spaces, and the result is passed through ``text_process.normalize``.
    The resulting table is also written to ``out.csv`` in the current
    working directory.

    Args:
        files_list: Iterable of paths to PDF files.

    Returns:
        A ``pandas.DataFrame`` with one row per input file and the columns
        ``Path`` (the input path), ``File Name`` (the path's base name
        without its extension) and ``Text`` (the normalized text).

    Raises:
        OSError: If one of the PDF files cannot be opened.
    """
    import os  # local import: the file's top-level imports are not visible here

    # Materialize once so the input can be iterated twice, even if it is a
    # generator, and so every column is built from the same set of paths.
    paths = list(files_list)

    resumes = [text_process.normalize(_pdf_to_text(pdf_path)) for pdf_path in paths]

    # Base name without extension; works for any directory depth and for
    # names that contain extra dots.
    names = [os.path.splitext(os.path.basename(pdf_path))[0] for pdf_path in paths]

    # Keep feeding the module-level list for any code that reads it, but
    # build the table from the per-call list so repeated calls do not
    # produce columns of different lengths.
    file_names.extend(names)

    data = pd.DataFrame({'Path': paths, 'File Name': names, 'Text': resumes})
    data.to_csv('out.csv')
    return data