def extract_email_by_re(self):
    """Return the first e-mail-like token found in ``self.text``.

    The local part must start with a word character and may continue with
    word characters, ``.``, ``|`` or ``-`` (a hyphen only directly after a
    word character or another hyphen).  It is followed by ``@`` and a dotted
    run of word characters.

    Returns:
        The matched text, or ``''`` when nothing matches or the search fails.
    """
    email = ''
    try:
        # Accepts the same local-part strings as the former nested group
        # "(\w+-*[.|\w]*)*", but without a quantifier inside a quantifier:
        # the nested form backtracks exponentially on any long word that is
        # not followed by "@".  The local part must also be non-empty, so a
        # bare "@handle" is not reported as an address.
        emailgrp = re.search(
            r"\w(?:[\w.|]|(?<=[\w-])-)*@(?:\w+[.])*\w+", self.text)
        if emailgrp:
            email = emailgrp.group()
    except Exception:
        # Deliberately best-effort: report the problem and return ''.
        cprint("FAIL","Seaching email timeout,maybe there is no email info in this resume,please check it by yourself.")
    return email
Beispiel #2
0
def main():
    """Command-line entry point: check argv and run the extraction.

    Exactly one argument is expected, the input file path.  The text after
    the last '.' of that path is reported as the file suffix and handed to
    ExtractFactory together with the path.
    """
    args = sys.argv
    if len(args) != 2:
        bcolors.cprint('WARNING',"usage: python main.py input_file")
        sys.exit(1)

    filename = args[1]
    cprint('HEADER',"Input file is "+filename)
    # Everything after the last dot; the whole name when there is no dot.
    suffix = filename.rsplit('.', 1)[-1]
    cprint('HEADER',"File suffix is "+suffix)
    ExtractFactory(filename, suffix).extract_control()
Beispiel #3
0
    def store_2_db(self,name,email,phone,filepath,filetext):
        """Print the extracted fields, then store the file and a record for it.

        The file at ``filepath`` is uploaded with ``GFS.fs.put``.  A document
        holding the extracted fields, the raw text and the id returned by that
        upload is then inserted into the ``GFS.db.cvtext`` collection.

        Args:
            name: Extracted name; also used in the generated file name.
            email: Extracted e-mail address; also used in the file name.
            phone: Extracted phone number; also used in the file name.
            filepath: Path of the source file on disk.
            filetext: Text extracted from the file.
        """
        cprint("HEADER",'**************************************')
        cprint("OK",name)
        cprint("OK",email)
        cprint("OK",phone)
        cprint("OK",filepath)
        cprint("HEADER",'**************************************')

        # Store the file itself.
        cprint("OK","File size : "+str(os.path.getsize( filepath )) + " bytes" )
        # Use a context manager so the handle is closed as soon as the upload
        # returns instead of staying open until garbage collection.
        with open(filepath, 'rb') as source:
            fileID = GFS.fs.put(source)
        cprint("OK","File ID MD5 : "+str(fileID) )

        # Store the record that points at the uploaded file.
        posts = GFS.db.cvtext
        extension = filepath.split('.')[-1]
        filename = name+'_'+email+'_'+phone+'.'+extension
        cprint("OK","Create new file name : "+filename)
        post = {"name": name,"email": email,"phone": phone,"filename":filename,"filetext": filetext,"fileid":fileID}
        cprint("OK","Record MD5 : "+str(posts.insert(post)) )
Beispiel #4
0
 def __init__(self):
     """Connect through ``GFS._connect`` and print the server information."""
     cprint("HEADER","__init__")
     GFS._connect()
     banner = "server info " + " * " * 40
     cprint("HEADER", banner)
     # server_info has to be called: referencing it without parentheses
     # prints the bound-method repr rather than the server details.
     # NOTE(review): assumes GFS.conn is a PyMongo client, where server_info
     # is a method - confirm against GFS._connect.
     cprint("HEADER",str(GFS.conn.server_info()) )
     cprint("HEADER", banner)
    def extract_control(self):
        """Extract text from ``self.filename`` according to ``self.suffix``.

        Supported suffixes are "pdf", "docx", "txt" and "doc"; any other value
        only prints a warning.  Each supported type is read by its own
        extractor class, and the outcome is handled by
        ``_report_extraction``.
        """
        gfs = GFS()
        # Shared by every format except pdf, which has its own hint.
        empty_file_hint = ",make sure the file is not empty and no space/special characters in file name as well."

        if self.suffix == "pdf":
            cprint("HEADER", "Enter extract pdf...")
            pdf = ExtractPdf(self.filename)
            self._report_extraction(
                gfs,
                pdf,
                pdf.extract_text_from_pdf1(),
                ",make sure the pdf file is not produced by scanned images and no space/special characters in file name as well.",
            )
        elif self.suffix == "docx":
            cprint("HEADER", "Enter extract docx...")
            docx = ExtractDocx(self.filename)
            self._report_extraction(
                gfs, docx, docx.extract_text_from_docx1(), empty_file_hint)
        elif self.suffix == "txt":
            cprint("HEADER", "Enter extract txt...")
            txt = ExtractTxt(self.filename)
            # The txt path does not dump the extracted text to stdout.
            self._report_extraction(
                gfs, txt, txt.extract_text_from_txt(), empty_file_hint,
                show_text=False)
        elif self.suffix == "doc":
            cprint("HEADER", "Enter extract doc...")
            doc = ExtractDoc(self.filename)
            self._report_extraction(
                gfs, doc, doc.extract_text_from_doc(), empty_file_hint)
        else:
            cprint(
                "WARNING", "Warning:Input a wrong formated file,currently,this tool only accept .pdf/.docx/.doc/.txt"
            )

    def _report_extraction(self, gfs, extractor, return_list, warning_hint, show_text=True):
        """Handle one extractor result: report it and store it on success.

        Args:
            gfs: Object whose ``store_2_db`` receives the extracted record.
            extractor: Extractor that produced ``return_list``; its ``text``
                attribute and ``get_all_text()`` method are read on success.
            return_list: Result of the extractor call.  Item 0 is the status
                (0 is handled as success, -1 as a warning, anything else as a
                failure) and item 1 is the message to print.
            warning_hint: Text appended to the message for status -1.
            show_text: When true, print ``extractor.get_all_text()`` on
                success.
        """
        status = return_list[0]
        if status == 0:
            if show_text:
                # Single-argument parenthesised form: prints the same under
                # the Python 2 print statement and is valid Python 3.
                print(extractor.get_all_text())
            cprint("OK", return_list[1])
            # extract name, email, phone number
            info_list = self.extract_info(extractor.text)
            name, email, phone = info_list[0], info_list[1], info_list[2]
            # Only store records for which all three fields are non-empty.
            if name and email and phone:
                gfs.store_2_db(name, email, phone, self.filename, extractor.text)
        elif status == -1:
            cprint("WARNING", return_list[1] + warning_hint)
        else:
            cprint("FAIL", return_list[1])
    def extract_info(self, text):
        """Pull the phone number, e-mail address and name out of ``text``.

        Each field is looked up by its own strategy class and the outcome is
        reported with cprint.  A field that is not found falls back to its
        default: "" for phone and e-mail, "tester" for the name.

        Returns:
            The list ``[name, email, phone]``.
        """
        # Phone number
        phone_found = PhoneStrategy(text).extract_phone_interface()
        if phone_found:
            cprint("OK", "Phone:" + phone_found)
        else:
            cprint("FAIL", "Extract phone number failed")

        # E-mail address
        email_found = EmailStrategy(text).extract_email_interface()
        if email_found:
            cprint("OK", "Email:" + email_found)
        else:
            cprint("FAIL", "Extract email failed")

        # Name
        name_found = NameStrategy(text).extract_name_interface()
        if name_found:
            cprint("OK", "Name:" + name_found)
        else:
            cprint("FAIL", "Extract name failed")

        return [name_found or "tester", email_found or "", phone_found or ""]