def writeHeadings():
    """Dump the headings of every XML file in the source directory to a text file.

    Each path returned by ``getFiles`` for the hard-coded ``ONLY_XML``
    directory is checked for "xml" after its first "."; matching files are
    parsed with ``ParseText`` and every heading reported by ``findHeadings``
    is written to ``all_headings_other_without.txt``, one heading per line.
    Characters that cannot be represented in ASCII are dropped from the
    headings before writing.

    The output file is overwritten on every call.  Nothing is returned.
    """
    # NOTE(review): getFiles is assumed to yield path strings -- confirm
    # against its definition.
    fileset = getFiles("/home/shreya/Wharton/NEW/Other/ONLY_XML")

    # The context manager guarantees the output file is flushed and closed
    # even when parsing one of the input files raises.
    with open("all_headings_other_without.txt", 'w') as f:
        for filepath in fileset:
            # find() instead of index(): a path without any "." is skipped
            # rather than aborting the whole run with a ValueError.
            dot = filepath.find(".")
            if dot == -1 or "xml" not in filepath[dot:]:
                continue

            # Single-argument parenthesised form prints the same text as the
            # Python 2 print statement.
            print(filepath)

            pt = ParseText(filepath)
            content = pt.readXmlToString()
            content_list = pt.readTextToList()
            # The empty list means no candidate headings are supplied here.
            # Only the heading texts are needed; their indexes are ignored.
            _heading_indexes, headings = pt.findHeadings(content, content_list, [])

            for heading in headings:
                f.write(heading.encode('ascii', 'ignore') + "\n")
fileset = getFiles("/home/shreya/Wharton/XML") with open("work_exp.csv", "w") as csvfile: csvwriter = csv.writer(csvfile, delimiter=",") csvwriter.writerow(["Filename", "Company, Position, Duration", "About Job", "EXPERIENCE"]) for xml_filepath in fileset: index = xml_filepath.index(".") if "xml" in xml_filepath[index:]: # print "xml: ", xml_filepath row = [] first_index = xml_filepath.rfind("/") + 1 last_index = xml_filepath.rfind(".") filename = xml_filepath[first_index:last_index] # print "filename: "+filename text_filepath = "/home/shreya/Wharton/PDF_text/" + filename + ".txt" # print "text_filepath: ", text_filepath pt = ParseText(xml_filepath, text_filepath) content = pt.readXmlToString() content_list = pt.readTextToList() heading_indexes, headings = pt.findHeadings(content, content_list, PROBABLE_HEADINGS) # bio = pt.find_bio(content, content_list, headings, heading_indexes) # print "BIO: ", bio # print "HEADINGS: ", headings # edu = pt.find_this(content, content_list, "education", headings, heading_indexes) exp = pt.find_this(content, content_list, "experience", headings, heading_indexes) # print "EDUCATION: ", edu # print "EXPERIENCE: ", exp if not exp: exp = pt.find_this(content, content_list, "history", headings, heading_indexes) split_exp = splitExp(exp)