def _flush_mediaobject(output_file, pieces):
    """Write the text gathered for one <mediaobject> as a single line.

    The line is the concatenation of `pieces` followed by a space,
    lemmatizer.SEPARATOR and a newline. `pieces` is emptied in place.

    Returns 1 if a line was written and 0 if `pieces` was empty, so the
    caller can add the result directly to its running count.
    """
    if not pieces:
        return 0
    output_file.write("".join(pieces) + " " + lemmatizer.SEPARATOR + "\n")
    del pieces[:]
    if V:
        print("====")
    return 1


def extract_text(dfki_xml_fpath, output_fpath):
    """Extract text from the DFKI xml file generated by their crawler.

    For every <mediaobject> the cleaned <filename>, <category> and <tag>
    texts are concatenated and written to `output_fpath` as one line that
    ends with lemmatizer.SEPARATOR. Categories/tags whose lowercased text
    is "none" are ignored. Nothing is returned; a summary is printed.

    dfki_xml_fpath -- path of the xml file to read
    output_fpath   -- path of the text file to (over)write
    """
    # Parse before opening the output so that an unreadable or malformed
    # input does not leave a truncated output file behind.
    tree = ElementTree.parse(dfki_xml_fpath)

    pieces = []  # text fragments of the mediaobject currently being read
    mo_count = 0
    with open(output_fpath, "w") as output_file:
        # iter() walks the tree in document order, so a <mediaobject>
        # element is always seen before its own children.
        for element in tree.iter():
            if element.tag == "mediaobject":
                # A new mediaobject starts: write out the previous one.
                # This is checked before the text test below because a
                # <mediaobject> without leading whitespace has text None
                # and must still act as a boundary.
                mo_count += _flush_mediaobject(output_file, pieces)
                continue

            if element.text is None:
                continue

            if element.tag == "filename":
                filename = clean_text(element.text)
                pieces.append(" " + filename + " . ")
                if V:
                    print("%s = %s" % (element.tag, filename))
            elif (element.tag in ("category", "tag")
                    and element.text.lower() != "none"):
                tag = clean_text(element.text.lower().strip())
                pieces.append(" " + tag)
                if V:
                    print("%s = %s" % (element.tag, tag))

        # The last mediaobject has no successor to trigger its write,
        # so flush whatever is still pending.
        mo_count += _flush_mediaobject(output_file, pieces)

    print("Text from %s mediaobjects was extracted from %s to %s"
          % (mo_count, dfki_xml_fpath, output_fpath))
def cleanup_dbpedia_titles(dbpedia_file, output_file):
    """Preprocess the DBPedia abstracts file into a list of clean titles.

    Every non-blank input line is searched for an English literal of the
    form `> "title"@en <http://`. The title is passed through clean_text;
    titles that become empty or are rejected by skip_title are dropped.
    Each remaining title is written to the output as one line ending
    with lemmatizer.SEPARATOR. A summary of written vs. total non-blank
    lines is printed at the end.

    dbpedia_file -- path of the DBPedia file to read
    output_file  -- path of the text file to (over)write
    """
    # Compiled once; the raw string keeps \s a regex escape rather than a
    # (deprecated) string escape.
    title_re = re.compile(r'>\s+"([^"]+)"@en\s+<http://')

    titles_written_count = 0
    titles_total_count = 0

    # `with` guarantees both files are closed even if processing fails.
    with open(dbpedia_file, "r") as source, open(output_file, "w") as target:
        for line in source:
            # Skip the void lines
            if line.strip() == "":
                continue
            titles_total_count += 1

            match = title_re.search(line)
            if not match:
                continue

            title = clean_text(match.group(1))
            if title == "" or skip_title(title):
                if V:
                    print("DELETED %s" % title)
                continue

            target.write(title + " " + lemmatizer.SEPARATOR + "\n")
            titles_written_count += 1
            if V:
                print(title)

    # The double space before "definitions" reproduces the original output.
    print("%s definitions were written out of %s  definitions in the input file."
          % (titles_written_count, titles_total_count))