def cleanGhr():
    """Run ``html2trec`` over the GHR data directory.

    Builds the field-extraction spec for GHR pages and passes it, along with
    the path ``../../data/ghr-data/``, the label ``ghr`` and the name
    ``filelist``, to ``html2trec`` (defined elsewhere in the project).

    Returns ``None``; whatever ``html2trec`` returns is not used here.
    """
    # Each entry is [field name, [selector, ...]]. A selector is
    # [attribute, value] or [attribute, value, "<n>"].
    # NOTE(review): the third selector item presumably picks the n-th
    # matching element -- confirm against html2trec.
    title = ["title", [["id", "tab1"]]]
    review_date = ["reviewdate", [["class", "datestamp", "0"]]]
    publish_date = ["publishdate", [["class", "datestamp", "1"]]]
    description = ["description", [["class", "allcontent", "0"]]]
    fields = [title, review_date, publish_date, description]

    # First list is "mandatory", second is "oneof"; the numbers are
    # presumably positions in ``fields`` -- verify against html2trec.
    mandatory, one_of = [0, 3], []
    required = (mandatory, one_of)

    html2trec("../../data/ghr-data/", "ghr", "filelist", fields, required)
def cleanMadisons():
    """Run ``html2trec`` over the Madisons data directory.

    Builds the field-extraction spec for Madisons pages and passes it, along
    with the path ``../../data/madisons-data/``, the label ``madisons`` and
    the name ``filelist``, to ``html2trec`` (defined elsewhere in the
    project).

    Returns ``None``; whatever ``html2trec`` returns is not used here.
    """
    # Every field here is located by a single ["class", <class name>, "0"]
    # selector, so the spec is generated from (field name, class name) pairs.
    # NOTE(review): the trailing "0" presumably selects the first matching
    # element -- confirm against html2trec.
    class_lookup = (
        ("title", "headerpaneopen"),
        ("createdate", "createdate"),
        ("modifydate", "modifydate"),
        ("description", "mpowerwrapper"),
    )
    fields = [
        [field_name, [["class", css_class, "0"]]]
        for field_name, css_class in class_lookup
    ]

    # First list is "mandatory", second is "oneof"; the numbers are
    # presumably positions in ``fields`` -- verify against html2trec.
    required = ([0, 3], [])

    html2trec(
        "../../data/madisons-data/",
        "madisons",
        "filelist",
        fields,
        required,
    )
def cleanGard():
    """Run ``html2trec`` over the GARD data directory.

    Builds the field-extraction spec for GARD pages and passes it, along with
    the path ``../../data/gard-data/``, the label ``gard`` and the name
    ``filelist``, to ``html2trec`` (defined elsewhere in the project).

    Returns ``None``; whatever ``html2trec`` returns is not used here.
    """
    # Boilerplate sentence attached to the description field as a third
    # element.
    # NOTE(review): presumably html2trec treats a description equal to this
    # text as "no real description" -- confirm how the third item is used.
    default_desc = (
        'These Web pages are updated as the Genetic and Rare Diseases '
        'Information Center receives questions and as new information '
        'becomes available.'
    )

    # Each entry is [field name, [selector, ...]] with an optional extra item;
    # every selector here is an ["id", <element id>] pair.
    fields = []
    fields.append(["title", [["id", "lblTitle"]]])
    fields.append(["synonyms", [["id", "divSynonymColumn"]]])
    fields.append(
        ["description", [["id", "lblDescriptionQuestion"]], default_desc]
    )
    fields.append(["references", [["id", "divReferences"]]])

    # First list is "mandatory", second is "oneof"; the numbers are
    # presumably positions in ``fields`` -- verify against html2trec.
    mandatory = [0]
    one_of = [1, 2]
    required = (mandatory, one_of)

    html2trec("../../data/gard-data/", "gard", "filelist", fields, required)
def cleanHon():
    """Run ``html2trec`` over the HON data directory.

    Builds the field-extraction spec for HON pages and passes it, along with
    the path ``../../data/hon-data/``, the label ``hon`` and the name
    ``filelist``, to ``html2trec`` (defined elsewhere in the project).

    Returns ``None``; whatever ``html2trec`` returns is not used here.
    """
    # The title is described by two selectors in sequence.
    # NOTE(review): presumably the second ("xpath") selector is applied
    # inside the element found by the first -- confirm against html2trec.
    title_selectors = [
        ["class", "input", "5"],
        ["xpath", "option", "0"],
    ]
    description_selectors = [["class", "txtbeige", "0"]]

    fields = [
        ["title", title_selectors],
        ["description", description_selectors],
    ]

    # First list is "mandatory", second is "oneof"; the numbers are
    # presumably positions in ``fields`` -- verify against html2trec.
    required = ([0, 1], [])

    html2trec("../../data/hon-data/", "hon", "filelist", fields, required)