def _flush_mediaobject(output_file, parts):
    """Write the text pieces collected for one mediaobject as a single line.

    Returns 1 if a line was written and 0 if there was nothing to write, so
    the caller can add the result to its record counter.
    """
    if not parts:
        return 0
    output_file.write("".join(parts) + " " + lemmatizer.SEPARATOR + "\n")
    if V:
        print("====")
    return 1


def extract_text(dfki_xml_fpath, output_fpath):
    """Extract the text of every <mediaobject> from a DFKI crawler XML file.

    The XML tree is walked in document order. For each <mediaobject> one
    output line is built from the cleaned <filename> text (followed by
    " . ") and the cleaned, lower-cased <category> and <tag> texts; values
    equal to "none" (case-insensitive) are ignored. Every line is terminated
    with a space, lemmatizer.SEPARATOR and a newline.

    Args:
        dfki_xml_fpath: path of the XML file to read.
        output_fpath: path of the text file to create or overwrite.

    Returns:
        None. The lines are written to output_fpath and a one-line summary
        is printed to stdout. Extra per-element output is printed when the
        module-level flag V is truthy.
    """
    # Parse first so that an unreadable or malformed input does not leave a
    # freshly truncated output file behind.
    tree = ElementTree.parse(dfki_xml_fpath)

    mo_parts = []  # text pieces of the mediaobject currently being read
    mo_count = 0   # number of mediaobject lines actually written

    with open(output_fpath, "w") as output_file:
        for element in tree.iter():
            # A <mediaobject> is visited before its children, so reaching one
            # means the previous record is complete. This test must come
            # before the text check: a <mediaobject> directly followed by a
            # child element has text None but is still a record boundary.
            if element.tag == "mediaobject":
                mo_count += _flush_mediaobject(output_file, mo_parts)
                mo_parts = []
                continue

            if element.text is None:
                continue

            if element.tag == "filename":
                filename = clean_text(element.text)
                mo_parts.append(" " + filename + " . ")
                if V:
                    print("%s = %s" % (element.tag, filename))
            elif (element.tag in ("category", "tag")
                    and element.text.lower() != "none"):
                tag = clean_text(element.text.lower().strip())
                mo_parts.append(" " + tag)
                if V:
                    print("%s = %s" % (element.tag, tag))

        # The last record has no following <mediaobject> to trigger its
        # write, so flush it explicitly.
        mo_count += _flush_mediaobject(output_file, mo_parts)

    print("Text from %s mediaobjects was extracted from %s to %s"
          % (mo_count, dfki_xml_fpath, output_fpath))
def cleanup_dbpedia_titles(dbpedia_file, output_file):
    """Preprocess a DBPedia file and write its English titles, one per line.

    Every non-blank input line is searched for an English literal of the
    form  > "<title>"@en <http://  and the captured title is passed through
    clean_text. Titles that come back empty, or for which skip_title returns
    a true value, are dropped. Each remaining title is written followed by a
    space, lemmatizer.SEPARATOR and a newline.

    Args:
        dbpedia_file: path of the DBPedia input file.
        output_file: path of the text file to create or overwrite.

    Returns:
        None. A summary with the number of written titles and the number of
        non-blank input lines is printed to stdout. Each kept or deleted
        title is also printed when the module-level flag V is truthy.
    """
    # Compiled once, outside the loop; the raw string keeps \s a regex escape.
    title_re = re.compile(r'>\s+"([^"]+)"@en\s+<http://')

    titles_written_count = 0
    titles_total_count = 0

    # "with" closes both files even if cleaning or writing raises.
    with open(dbpedia_file, "r") as src, open(output_file, "w") as dst:
        for line in src:
            # Skip the void lines; they are not counted as input entries.
            if line.strip() == "":
                continue

            titles_total_count += 1

            match = title_re.search(line)
            if not match:
                continue

            title = clean_text(match.group(1))
            if title == "" or skip_title(title):
                if V:
                    print("DELETED %s" % title)
                continue

            dst.write(title + " " + lemmatizer.SEPARATOR + "\n")
            titles_written_count += 1
            if V:
                print(title)

    # The message text (including its double space) matches the original
    # output exactly.
    print("%s definitions were written out of %s  definitions in the input file."
          % (titles_written_count, titles_total_count))