def extract_email_by_re(self):
    """Return the first e-mail-like substring found in ``self.text``.

    The text is searched with a regular expression. The matched substring is
    returned; an empty string is returned when nothing matches or when the
    search raises.

    Returns:
        str: the matched address, or ``''``.
    """
    email = ''
    try:
        # Raw string so the backslash escapes reach the regex engine verbatim.
        # NOTE(review): the nested quantifiers in the local part can backtrack
        # heavily on long runs of word characters not followed by '@'.
        emailgrp = re.search(r"(\w+-*[.|\w]*)*@(\w+[.])*\w+", self.text)
        if emailgrp:
            email = emailgrp.group()
    except Exception:
        # Best effort: report the failure and fall through with ''.
        cprint("FAIL", "Seaching email timeout,maybe there is no email info in this resume,please check it by yourself.")
    # Previously the result was computed and then dropped.
    return email
def main():
    """Command-line entry point: run the extraction on one input file.

    Expects exactly one command-line argument, the input file path. With any
    other argument count a usage warning is printed and the process exits
    with status 1. Otherwise the file name and its suffix are handed to
    ``ExtractFactory`` and ``extract_control`` is invoked.
    """
    # Local import keeps this block self-contained.
    import os.path

    if len(sys.argv) != 2:
        bcolors.cprint('WARNING', "usage: python main.py input_file")
        sys.exit(1)

    filename = sys.argv[1]
    cprint('HEADER', "Input file is " + filename)

    # splitext only inspects the last path component, so a dot in a directory
    # name (or a name with no extension) no longer produces a bogus suffix.
    # The leading '.' is stripped to keep the suffix format unchanged.
    suffix = os.path.splitext(filename)[1][1:]
    cprint('HEADER', "File suffix is " + suffix)

    efactory = ExtractFactory(filename, suffix)
    efactory.extract_control()
def store_2_db(self, name, email, phone, filepath, filetext):
    """Store the source file via ``GFS.fs`` and insert its extracted record.

    Prints the extracted fields, uploads the file at ``filepath`` with
    ``GFS.fs.put``, then inserts one record into ``GFS.db.cvtext``.

    Args:
        name: extracted candidate name (used in the generated file name).
        email: extracted e-mail address.
        phone: extracted phone number.
        filepath: path of the file to upload.
        filetext: extracted text stored alongside the record.
    """
    separator = '**************************************'
    cprint("HEADER", separator)
    for field in (name, email, phone, filepath):
        cprint("OK", field)
    cprint("HEADER", separator)

    # store file
    cprint("OK", "File size : " + str(os.path.getsize(filepath)) + " bytes")
    # Context manager guarantees the handle is closed even if put() raises;
    # the original left the file object open.
    with open(filepath, 'rb') as source:
        fileID = GFS.fs.put(source)
    cprint("OK", "File ID MD5 : " + str(fileID))

    # get one collection
    posts = GFS.db.cvtext

    # New file name: "<name>_<email>_<phone>.<text after the last dot of filepath>".
    extension = filepath.split('.')[-1]
    filename = name + '_' + email + '_' + phone + '.' + extension
    cprint("OK", "Create new file name : " + filename)

    post = {
        "name": name,
        "email": email,
        "phone": phone,
        "filename": filename,
        "filetext": filetext,
        "fileid": fileID,
    }
    cprint("OK", "Record MD5 : " + str(posts.insert(post)))
def __init__(self):
    """Call ``GFS._connect`` and print the connection's ``server_info``."""
    cprint("HEADER", "__init__")
    GFS._connect()

    # Same banner line is printed above and below the server info.
    banner = "server info " + " * " * 40
    cprint("HEADER", banner)
    # NOTE(review): server_info is passed un-called; if it is a method this
    # prints its repr rather than the info itself -- confirm intent.
    info_text = str(GFS.conn.server_info)
    cprint("HEADER", info_text)
    cprint("HEADER", banner)
def extract_control(self):
    """Extract text from ``self.filename`` according to ``self.suffix``.

    Supported suffixes are ``pdf``, ``docx``, ``txt`` and ``doc``. For a
    supported suffix the matching extractor is run and its result is passed
    to ``_store_extracted``. Any other suffix only prints a warning.
    """
    # Constructed up front, before the suffix is checked, to keep the
    # original order of side effects.
    gfs = GFS()

    # Hint appended to the warning for every format except pdf.
    empty_hint = ",make sure the file is not empty and no space/special characters in file name as well."

    if self.suffix == "pdf":
        cprint("HEADER", "Enter extract pdf...")
        pdf = ExtractPdf(self.filename)
        self._store_extracted(
            gfs,
            pdf,
            pdf.extract_text_from_pdf1(),
            True,
            ",make sure the pdf file is not produced by scanned images and no space/special characters in file name as well.",
        )
    elif self.suffix == "docx":
        cprint("HEADER", "Enter extract docx...")
        docx = ExtractDocx(self.filename)
        self._store_extracted(gfs, docx, docx.extract_text_from_docx1(), True, empty_hint)
    elif self.suffix == "txt":
        cprint("HEADER", "Enter extract txt...")
        txt = ExtractTxt(self.filename)
        # The txt branch never echoed the extracted text.
        self._store_extracted(gfs, txt, txt.extract_text_from_txt(), False, empty_hint)
    elif self.suffix == "doc":
        cprint("HEADER", "Enter extract doc...")
        doc = ExtractDoc(self.filename)
        self._store_extracted(gfs, doc, doc.extract_text_from_doc(), True, empty_hint)
    else:
        cprint(
            "WARNING",
            "Warning:Input a wrong formated file,currently,this tool only accept .pdf/.docx/.doc/.txt"
        )


def _store_extracted(self, gfs, extractor, return_list, echo_text, warning_hint):
    """Handle one extractor result: report it and store the record on success.

    Args:
        gfs: object whose ``store_2_db`` receives the extracted record.
        extractor: extractor instance exposing ``text`` and ``get_all_text()``.
        return_list: extractor result; index 0 is the status code
            (0 success, -1 warning, anything else failure), index 1 the message.
        echo_text: when true, print ``extractor.get_all_text()`` on success.
        warning_hint: text appended to the message when the status is -1.
    """
    status = return_list[0]
    message = return_list[1]

    if status == 0:
        if echo_text:
            # Single-argument print(): same output on Python 2 and 3.
            print(extractor.get_all_text())
        cprint("OK", message)
        # extract name, email, phone number
        info_list = self.extract_info(extractor.text)
        name, email, phone = info_list[0], info_list[1], info_list[2]
        # import mongoDB -- only when all three fields were found
        if name and email and phone:
            gfs.store_2_db(name, email, phone, self.filename, extractor.text)
    elif status == -1:
        cprint("WARNING", message + warning_hint)
    else:
        cprint("FAIL", message)
def extract_info(self, text):
    """Extract phone number, e-mail and name from ``text``.

    Each field is obtained from its strategy object, in the order phone,
    e-mail, name, and the outcome is reported through ``cprint``. A field
    that cannot be extracted keeps its default: ``""`` for phone and e-mail,
    ``"tester"`` for the name.

    Returns:
        list: ``[name, email, phonenum]``.
    """
    # Phone number.
    found_phone = PhoneStrategy(text).extract_phone_interface()
    if not found_phone:
        cprint("FAIL", "Extract phone number failed")
        phonenum = ""
    else:
        cprint("OK", "Phone:" + found_phone)
        phonenum = found_phone

    # E-mail address.
    found_email = EmailStrategy(text).extract_email_interface()
    if not found_email:
        cprint("FAIL", "Extract email failed")
        email = ""
    else:
        cprint("OK", "Email:" + found_email)
        email = found_email

    # Name; falls back to the placeholder when extraction fails.
    found_name = NameStrategy(text).extract_name_interface()
    if not found_name:
        cprint("FAIL", "Extract name failed")
        name = "tester"
    else:
        cprint("OK", "Name:" + found_name)
        name = found_name

    return [name, email, phonenum]