def init():
    """Create the patent table in the database if it is not already there.

    Opens a connection through ``createconnection()``, issues a single
    ``create table if not exists`` statement for ``universal.tablename``,
    commits, and always closes the connection afterwards.  Any exception
    is written to the log and recorded by setting ``universal.logflag``
    to 1; nothing is raised to the caller.
    """
    # One entry per column, in table order.
    column_defs = (
        "Application_No varchar(50)",
        "Date_of_filing_of_Application DATE",
        "Publication_Date DATE",
        "Name_of_Applicant varchar(1000)",
        "Title_of_Invention varchar(1000)",
        "Name_of_Inventor varchar(1500)",
        "Abstract varchar(3500)",
        "No_of_Pages INT",
        "No_of_Claims INT",
        "International_Classification varchar(100)",
        "Priority_Document_No varchar(70)",
        "Priority_date varchar(100)",
        "Name_of_Priority_country varchar(70)",
        "International_Publication_No varchar(70)",
        "International_Application_No varchar(70)",
        "International_Application_No_filing_date DATE",
        "Patent_of_addition_to_Application_No varchar(70)",
        "Patent_of_addition_to_Application_No_filing_date DATE",
        "Divisional_Application_No varchar(100)",
        "Divisional_Application_No_filing_date DATE",
    )
    try:
        createconnection()
        universal.con.execute(
            "create table if not exists " + universal.tablename
            + "(" + ",".join(column_defs) + ")"
        )
        universal.con.commit()
    except Exception as error:
        logwriter.logwrite(str(error))
        universal.logflag = 1
    finally:
        closeconnection()
def createconnection():
    """Connect to the MySQL server and select the working database.

    The connection is opened with ``universal.host``, ``universal.user``
    and ``universal.password`` and stored on ``universal.con``; a
    ``use <universal.dbname>`` query is then sent on it.  A
    ``_mysql.Error`` is logged and flagged through ``universal.logflag``
    instead of being raised.
    """
    try:
        universal.con = _mysql.connect(universal.host, universal.user,
                                       universal.password)
        universal.con.query("use " + universal.dbname)
    # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
    # comma form, and matches the other handlers in this code base.
    except _mysql.Error as e:
        logwriter.logwrite(str(e))
        universal.logflag = 1
def loop():
    """Insert the current record (``universal.data``) into the MySQL table.

    The six date fields are first normalised in place with ``transform``.
    Every value is then escaped with the connection's ``escape_string``
    before being placed between double quotes, so text containing quotes
    or backslashes (e.g. an abstract) no longer breaks or alters the
    statement.  Any exception is logged and flagged through
    ``universal.logflag``; nothing is raised to the caller.
    """
    date_fields = (
        "Date of filing of Application",
        "Publication Date",
        "Priority Date",
        "IAFiling Date",
        "IBFiling Date",
        "ICFiling Date",
    )
    # Keys of universal.data in the order of the table's columns.
    column_keys = (
        "Application No.",
        "Date of filing of Application",
        "Publication Date",
        "Name of Applicant",
        "Title of the invention",
        "Name of Inventor",
        "Abstract",
        "No. of Pages",
        "No. of Claims",
        "International classification",
        "Priority Document No",
        "Priority Date",
        "Name of priority country",
        "International Publication No",
        "International Application No",
        "IAFiling Date",
        "Patent of Addition to Application Number",
        "IBFiling Date",
        "Divisional to Application Number",
        "ICFiling Date",
    )

    def quoted(value):
        # Escape the characters that are special inside a MySQL string
        # literal, then wrap the result in double quotes.
        escaped = universal.con.escape_string(value)
        if not isinstance(escaped, str):
            # Some driver versions hand back bytes; the statement below
            # is assembled as text.
            escaped = escaped.decode("utf-8")
        return '"' + escaped + '"'

    try:
        for field in date_fields:
            transform(field)
        values = [quoted(universal.data[key]) for key in column_keys]
        q = ('insert into ' + universal.tablename
             + ' values(' + ",".join(values) + ')')
        universal.con.query(q)
    except Exception as e:
        logwriter.logwrite(str(e))
        universal.logflag = 1
def init():
    """Make sure the MySQL database and the patent table exist.

    Connects with ``universal.host`` / ``universal.user`` /
    ``universal.password``, creates ``universal.dbname`` when missing,
    switches to it and creates ``universal.tablename`` when missing.
    The connection is closed through ``closeconnection()`` in every
    case.  Exceptions are logged and flagged via ``universal.logflag``
    rather than raised.
    """
    # One entry per column, in table order.
    column_defs = (
        "Application_No varchar(50)",
        "Date_of_filing_of_Application DATE",
        "Publication_Date DATE",
        "Name_of_Applicant varchar(1000)",
        "Title_of_Invention varchar(1000)",
        "Name_of_Inventor varchar(1500)",
        "Abstract varchar(3500)",
        "No_of_Pages varchar(30)",
        "No_of_Claims varchar(30)",
        "International_Classification varchar(50)",
        "Priority_Document_No varchar(50)",
        "Priority_date DATE",
        "Name_of_Priority_country varchar(30)",
        "International_Publication_No varchar(30)",
        "International_Application_No varchar(30)",
        "International_Application_No_filing_date DATE",
        "Patent_of_addition_to_Application_No varchar(30)",
        "Patent_of_addition_to_Application_No_filing_date DATE",
        "Divisional_Application_No varchar(30)",
        "Divisional_Application_No_filing_date DATE",
    )
    try:
        universal.con = _mysql.connect(universal.host, universal.user,
                                       universal.password)
        universal.con.query(
            "create database if not exists " + universal.dbname)
        universal.con.query("use " + universal.dbname)
        universal.con.query(
            "create table if not exists " + universal.tablename
            + "(" + ",".join(column_defs) + ")"
        )
    except Exception as error:
        logwriter.logwrite(str(error))
        universal.logflag = 1
    finally:
        closeconnection()
def loop():
    """Insert the current record (``universal.data``) into the SQLite table.

    Steps:

    1. normalise the six date fields in place with ``transform``;
    2. replace an ``"NA"`` page or claim count by ``'0'``;
    3. insert all twenty fields as one row and commit.

    The values are bound through ``?`` placeholders instead of being
    pasted into the statement text, so quotes inside a title or abstract
    can neither break nor alter the statement.

    On any exception the error is logged with the 1-based page number,
    ``universal.logflag`` is set to 1 and the worksheet row
    ``universal.row`` is given a red font format.
    """
    date_fields = (
        "Date of filing of Application",
        "Publication Date",
        "Priority Date",
        "IAFiling Date",
        "IBFiling Date",
        "ICFiling Date",
    )
    # Keys of universal.data in the order of the table's columns.
    column_keys = (
        "Application No.",
        "Date of filing of Application",
        "Publication Date",
        "Name of Applicant",
        "Title of the invention",
        "Name of Inventor",
        "Abstract",
        "No. of Pages",
        "No. of Claims",
        "International classification",
        "Priority Document No",
        "Priority Date",
        "Name of priority country",
        "International Publication No",
        "International Application No",
        "IAFiling Date",
        "Patent of Addition to Application Number",
        "IBFiling Date",
        "Divisional to Application Number",
        "ICFiling Date",
    )
    try:
        for field in date_fields:
            transform(field)
        for count_key in ("No. of Pages", "No. of Claims"):
            if universal.data[count_key] == "NA":
                universal.data[count_key] = '0'

        placeholders = ",".join(["?"] * len(column_keys))
        q = ('insert into ' + universal.tablename
             + ' values(' + placeholders + ')')
        row_values = [universal.data[key] for key in column_keys]
        universal.con.execute(q, row_values)
        universal.con.commit()
    except Exception as e:
        logwriter.logwrite("MySQL: " + str(e) + " on page "
                           + str(int(universal.filename) + 1))
        universal.logflag = 1
        # Highlight the spreadsheet row so the failed record stands out.
        Format = universal.workbook.add_format()
        Format.set_font_color('red')
        universal.worksheet.set_row(universal.row, None, Format)
def initial():
    """Process every page of the current PDF and store the extracted records.

    Working folders are created through ``run_command`` and removed the
    same way at the end.  ``burstpdf()`` supplies the number of pages;
    each page is converted with ``convert.convert()`` and parsed with
    ``parser.begin()``.  Pages for which the parser returns -1 are
    skipped; every other page is written out via ``excelwriter.loop()``
    and ``mysql.loop()``.

    Returns 0 when the PDF has no pages, otherwise ``None``.
    """
    run_command("mkdir " + universal.pdf_folder)
    run_command("mkdir " + universal.tag_folder)

    # universal.filename is overwritten with page numbers below, so keep
    # the original name for the log banners.
    source_name = universal.filename
    page_count = burstpdf()
    banner = "********" + "\n" + source_name + "\n*************\n"
    logwriter.logwrite("\n" + banner)

    if page_count == 0:
        logwriter.logwrite("No pages in this pdf\n")
        logwriter.logwrite(banner)
        return 0

    excelwriter.init()

    # Phase 1: advance until the first page that parses as a patent.
    page = 0
    while page < page_count:
        universal.filename = str(page)
        convert.convert()
        page += 1
        if parser.begin() == -1:
            continue
        excelwriter.loop()
        mysql.loop()
        break

    universal.flag = 1  # Process of extraction will start
    print(universal.con)

    # Phase 2: handle the remaining pages, skipping those the parser rejects.
    for page in range(page, page_count):
        universal.filename = str(page)
        convert.convert()
        if parser.begin() != -1:
            excelwriter.loop()
            mysql.loop()

    universal.workbook.close()
    run_command("rm -r " + universal.pdf_folder)
    run_command("rm -r " + universal.tag_folder)
    logwriter.logwrite(banner)
def transform(tag):
    """Rewrite ``universal.data[tag]`` from ``dd/mm/YYYY`` to ``Y/M/D``.

    The stored value is parsed with the format ``%d/%m/%Y`` and replaced
    by ``"<year>/<month>/<day>"`` (numbers are not zero-padded).  The
    value ``"NA"`` and any value that does not match the format are
    replaced by the placeholder date 01/01/0001, i.e. ``"1/1/1"``; the
    mismatch case is additionally written to the log together with the
    1-based page number.

    Args:
        tag: key of the date field inside ``universal.data``.
    """
    date_format = "%d/%m/%Y"
    placeholder = "01/01/0001"

    raw = str(universal.data[tag])
    if raw == "NA":
        raw = placeholder

    try:
        parsed = datetime.datetime.strptime(raw, date_format)
    except ValueError:
        # strptime signals a format mismatch with ValueError; anything
        # else (e.g. KeyboardInterrupt) must not be swallowed here.
        parsed = datetime.datetime.strptime(placeholder, date_format)
        # Log the stringified value so a non-string entry cannot raise a
        # TypeError from inside this handler.
        logwriter.logwrite("MYSQL: " + tag + "-" + raw
                           + " is not in format on page "
                           + str(int(universal.filename) + 1))

    universal.data[tag] = (str(parsed.year) + "/" + str(parsed.month)
                           + "/" + str(parsed.day))
def begin():
    """Load the converted HTML page and run the patent extractor on it.

    The file named ``universal.filename`` repeated twice plus ``".html"``
    is opened through ``reopen``; all text yielded by
    ``universal.tree.itertext()`` is concatenated into
    ``universal.datastring`` and handed to ``extractor.getdetails``.

    Returns:
        Whatever ``extractor.getdetails`` returns, or -1 when the
        extractor raises (the error is logged with the 1-based page
        number and ``universal.logflag`` is set to 1).
    """
    universal.datastring = ""
    reopen(universal.filename + universal.filename + ".html")  # html-tag filename converted from pdf
    # join() builds the text in one pass instead of repeated "+=".
    universal.datastring = "".join(universal.tree.itertext())
    try:
        return extractor.getdetails(universal.datastring)
    except Exception as e:
        logwriter.logwrite("Extracter: " + str(e) + " on page "
                           + str(int(universal.filename) + 1))
        universal.logflag = 1
        return -1
def getdetails(new_patent):
    """Locate every tag of a patent page and trigger value extraction.

    ``new_patent`` is the page text; consecutive words must be separated
    by spaces.  The module-level ``data``, ``flag`` and ``indexvalues``
    containers are reset, the text is normalised, and for every entry of
    ``Tags`` the start/end word indices are looked up and appended to
    ``indexvalues``.  Tags that cannot be found get ``data[tag] = "NA"``.
    Finally ``extractvalues`` is called on the case-preserving word list.

    Parenthesis normalisation: ``(`` directly followed by a digit
    becomes ``" ("``, ``)`` directly preceded by a digit becomes
    ``") "``, and every other parenthesis is dropped.

    Returns:
        1 when ``check`` accepts the page, otherwise -1.
    """
    data.clear()
    flag.clear()
    del indexvalues[:]

    pieces = []
    last = len(new_patent) - 1
    for pos, char in enumerate(new_patent):
        if char == '(':
            # Guard the look-ahead: a "(" at the very end has no successor
            # (the unguarded index used to raise IndexError).
            if pos < last and new_patent[pos + 1].isdigit():
                pieces.append(" (")
        elif char == ')':
            # Guard the look-behind: index -1 would wrap around to the
            # last character of the text.
            if pos > 0 and new_patent[pos - 1].isdigit():
                pieces.append(") ")
        else:
            pieces.append(char)

    temp_patent = "".join(pieces)        # normalised text, original letter case
    words = temp_patent.lower().split()  # lower-cased words used for searching

    # check() decides whether the page holds a patent; it is compared
    # against True exactly as before because its definition is not
    # visible here.
    if check(words) != True:
        return -1

    for tag in Tags:
        flag[tag] = 0  # flag[tag] == 1 means the tag has been dealt with

    # Page and claim counts are located separately; the flags of those two
    # tags are already set to 1 inside getnoofpagesandclaims.
    pages, claims = getnoofpagesandclaims(temp_patent)
    if pages.start != -1 and pages.end != -1:
        indexvalues.append(Tag(pages_tag, pages.start, pages.end))
    if claims.start != -1 and claims.end != -1:
        indexvalues.append(Tag(claims_tag, claims.start, claims.end))

    not_found = Pair(-1, -1)
    for tag in Tags:
        if flag[tag] == 1:
            continue

        span = searchtag(words, tag)  # pair (start, end) of the tag
        if span == not_found:
            # Exact search failed: ask locate() for a substitute search
            # key.  Its first result is -1 when it has nothing to offer.
            (status, substitute) = locate(tag, words, 0.85)
            logwriter.logwrite(tag + " locate function")
            if status != -1:
                span = searchtag(words, substitute)
            if status == -1 or span == not_found:
                data[tag] = "NA"
                logwriter.logwrite(tag + " not found")
                flag[tag] = 1

        if flag[tag] == 0:
            indexvalues.append(Tag(tag, span.start, span.end))

    extractvalues(temp_patent.split())
    return 1
def begin():
    """Load the converted HTML page and run the patent extractor on it.

    The file named ``universal.filename`` repeated twice plus ``".html"``
    is opened through ``reopen``; a banner with the file name is logged,
    all text yielded by ``universal.tree.itertext()`` is concatenated
    into ``universal.datastring`` and handed to ``extractor.getdetails``.

    Returns:
        Whatever ``extractor.getdetails`` returns, or -1 when the
        extractor raises (the error is logged).
    """
    universal.datastring = ""
    reopen(universal.filename + universal.filename + ".html")  # html-tag filename converted from pdf
    logwriter.logwrite("***************" + universal.filename + "*************")
    # join() builds the text in one pass instead of repeated "+=".
    universal.datastring = "".join(universal.tree.itertext())
    try:
        return extractor.getdetails(universal.datastring)
    except Exception as e:
        # Every other logwrite call passes text, so convert the exception
        # explicitly instead of handing over the object itself.
        logwriter.logwrite(str(e))
        # TODO: run an alternative extraction routine when getdetails fails.
        return -1
def initial():
    """Process every page of the current PDF and store the extracted records.

    Both working folders (``universal.pdf_folder`` and
    ``universal.tag_folder``) are recreated empty first and removed again
    at the end.  ``burstpdf()`` supplies the number of pages; each page
    is converted with ``convert.convert()`` and parsed with
    ``Parser.begin()``.  Pages for which the parser returns -1 are
    skipped; every other page is written out via ``excelwriter.loop()``
    and ``sqlitewriter.loop()``.

    Returns 0 when the PDF has no pages, otherwise ``None``.
    """
    # Reset each folder on its own.  Previously a failing mkdir triggered
    # rmtree on BOTH folders, which itself crashed when only one of them
    # was left over from an earlier run.
    for folder in (universal.pdf_folder, universal.tag_folder):
        shutil.rmtree(folder, ignore_errors=True)
        os.mkdir(folder)

    # universal.filename is overwritten with page numbers below, so keep
    # the original name for the log banners.
    temp = universal.filename
    no_of_pages = burstpdf()
    banner = "********" + "\n" + str(temp) + "\n*************\n"
    logwriter.logwrite("\n" + banner)

    if no_of_pages == 0:
        logwriter.logwrite("No pages in this pdf\n")
        logwriter.logwrite(banner)
        return 0

    excelwriter.init()

    # Phase 1: advance until the first page that parses as a patent.
    i = 0
    while i < no_of_pages:
        universal.filename = str(i)
        convert.convert()  # for initializing conversion of files
        i += 1
        if Parser.begin() != -1:
            excelwriter.loop()
            sqlitewriter.loop()
            break

    universal.flag = 1  # Process of extraction will start

    # Phase 2: handle the remaining pages, skipping those the parser rejects.
    while i < no_of_pages:
        universal.filename = str(i)
        convert.convert()  # for initializing conversion of files
        if Parser.begin() != -1:
            excelwriter.loop()
            sqlitewriter.loop()
        i += 1

    universal.workbook.close()
    shutil.rmtree(universal.pdf_folder)
    shutil.rmtree(universal.tag_folder)
    logwriter.logwrite(banner)
def createconnection():
    """Open the SQLite file ``<universal.dbname>.db``.

    The connection object is stored on ``universal.con``.  A failure is
    written to the log and flagged by setting ``universal.logflag`` to 1
    instead of being raised.
    """
    try:
        database_file = universal.dbname + ".db"
        universal.con = sqlite3.connect(database_file)
    except Exception as error:
        logwriter.logwrite(str(error))
        universal.logflag = 1
def closeconnection():
    """Close the connection held in ``universal.con``.

    A failure while closing is written to the log and flagged by setting
    ``universal.logflag`` to 1; it is never raised to the caller.
    """
    try:
        connection = universal.con
        connection.close()
    except Exception as error:
        logwriter.logwrite(str(error))
        universal.logflag = 1
def loop():
    """Write the current record (``universal.data``) as one worksheet row.

    The twenty fields go into columns 0-19 of row ``universal.row``:

    * plain fields are written as they are;
    * the filing and publication dates are always written with
      ``universal.date_format``;
    * page and claim counts are written as ``int`` unless the value is
      "NA" in any letter case, in which case the upper-cased text is
      written;
    * the remaining dates are written with ``universal.date_format``
      unless they are exactly ``"NA"``, which is written as plain text.

    ``universal.row`` is advanced by one after the last column.  Any
    exception is logged together with ``universal.filename``; the row
    counter is then left unchanged.
    """
    plain, dated, counted, maybe_dated = range(4)
    # (key in universal.data, how to write it); position == column index.
    layout = (
        ("Application No.", plain),
        ("Date of filing of Application", dated),
        ("Publication Date", dated),
        ("Name of Applicant", plain),
        ("Title of the invention", plain),
        ("Name of Inventor", plain),
        ("Abstract", plain),
        ("No. of Pages", counted),
        ("No. of Claims", counted),
        ("International classification", plain),
        ("Priority Document No", plain),
        ("Priority Date", maybe_dated),
        ("Name of priority country", plain),
        ("International Application No", plain),
        ("IAFiling Date", maybe_dated),
        ("International Publication No", plain),
        ("Patent of Addition to Application Number", plain),
        ("IBFiling Date", maybe_dated),
        ("Divisional to Application Number", plain),
        ("ICFiling Date", maybe_dated),
    )
    try:
        for column, (key, kind) in enumerate(layout):
            value = universal.data[key]
            if kind == counted:
                upper_value = value.upper()
                cell = int(value) if upper_value != "NA" else upper_value
                universal.worksheet.write(universal.row, column, cell)
            elif kind == dated or (kind == maybe_dated and value != "NA"):
                universal.worksheet.write(universal.row, column, value,
                                          universal.date_format)
            else:
                universal.worksheet.write(universal.row, column, value)
        universal.row = universal.row + 1
    except Exception as error:
        logwriter.logwrite("Excelfile : " + str(error) + " on page "
                           + universal.filename)
def closeconnection():
    """Close the MySQL connection held in ``universal.con``.

    A ``_mysql.Error`` raised while closing is written to the log and
    flagged by setting ``universal.logflag`` to 1 instead of being
    raised.
    """
    try:
        universal.con.close()
    # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
    # comma form, and matches the other handlers in this code base.
    except _mysql.Error as e:
        logwriter.logwrite(str(e))
        universal.logflag = 1