def read_irsa_query(infile):
    """Parse an IRSA background-model XML query result into a list of dicts.

    :param infile: path (or file object) accepted by ``xmliter``.
    :return: list of dicts, one per ``<statistics>`` element, with the
        background components cast to float (``"(MJy/sr)"`` unit suffix
        stripped), ``refCoordinate`` split into ``'ra'``/``'dec'`` floats,
        and the document-level ``'year'``/``'day'`` attached to each record.
    """
    records = []
    # <year>/<day> occur once per document and apply to every record.
    year = [v for v in xmliter(infile, "year")][0]
    day = [v for v in xmliter(infile, "day")][0]
    # Background components are reported as "<value> (MJy/sr)" strings.
    background_keys = ("zody", "cib", "stars", "ism", "totbg")
    for d in tqdm(xmliter(infile, "statistics")):
        for key in background_keys:
            d[key] = float(d[key].replace("(MJy/sr)", ""))
        # refCoordinate is "ra dec"; split once instead of twice.
        coord = d.pop("refCoordinate").split(" ")
        d["ra"] = float(coord[0])
        d["dec"] = float(coord[1])
        d["year"] = year
        d["day"] = day
        records.append(d)
    return records
def test_parsing_note(xmldata_note, parser):
    """Every <note> element parses to a dict and at least one is produced."""
    parsed = []
    for record in xmliter(xmldata_note, 'note', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed)
def test_parsing_plants(xmldata_plants, parser):
    """All 36 <PLANT> records parse into well-formed dicts."""
    parsed = []
    for record in xmliter(xmldata_plants, 'PLANT', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed) == 36
def test_parsing_menu(xmldata_menu, parser):
    """All 5 <food> records parse into well-formed dicts."""
    parsed = []
    for record in xmliter(xmldata_menu, 'food', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed) == 5
def test_parsing_cd(xmldata_cd, parser):
    """All 26 <CD> records parse into well-formed dicts."""
    parsed = []
    for record in xmliter(xmldata_cd, 'CD', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed) == 26
def parse(cls, file_path):
    """
    Generator parser.

    :param file_path: path to xml file
    :return: yields artist items, one per <artist> element
    """
    yield from map(cls, xmliter(file_path, 'artist'))
def test_parsing_test_doc(parser):
    """The bundled test_doc.xml yields exactly three <AnItem> dicts."""
    here = os.path.dirname(os.path.abspath(__file__))
    xml_path = os.path.join(here, 'test_doc.xml')
    parsed = []
    for record in xmliter(xml_path, 'AnItem', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed) == 3
def parse(cls, file_path):
    """
    Generator parser.

    :param file_path: path to xml file
    :return: yields Release items, one per <release> element
    """
    yield from map(cls, xmliter(file_path, 'release'))
def test_parsing_google_renewal_data_1(parser):
    """The Google-renewals subset yields exactly four <Record> dicts."""
    here = os.path.dirname(os.path.abspath(__file__))
    xml_path = os.path.join(here, 'google-renewals-subset-20080624.xml')
    parsed = []
    for record in xmliter(xml_path, 'Record', parsing_method=parser):
        assert isinstance(record, dict)
        walk_test(record)
        parsed.append(record)
    assert len(parsed) == 4
def read_badges(xml_badges_file_path):
    """Read a StackExchange Badges.xml dump into a per-user badge map.

    :param xml_badges_file_path: path to Badges.xml, or None to skip
        reading badges entirely.
    :return: dict mapping user id -> list of (badge class, award date)
        tuples; empty dict when no path was given.
    """
    map_user_badges = {}
    if xml_badges_file_path is not None:
        for attr_dic in xmliter(xml_badges_file_path, 'row'):
            user_id = int(attr_dic["@UserId"])
            # Only the first character of @Class is used (single-digit class).
            class_type = int(attr_dic["@Class"][0])
            date = attr_dic["@Date"]
            # setdefault replaces the explicit membership test per row.
            map_user_badges.setdefault(user_id, []).append((class_type, date))
    return map_user_badges
def __init__(self, xml_user_file_path, xml_badges_file_path):
    """Parse Users.xml into ``self.map_of_user`` (user id -> User).

    :param xml_user_file_path: path to the Users.xml dump.
    :param xml_badges_file_path: path to Badges.xml (or None), used to
        attach each user's badge list via ``read_badges``.
    """
    self.map_of_user = {}
    map_user_badges = UserParserRecord.read_badges(xml_badges_file_path)
    for attr_dic in xmliter(xml_user_file_path, 'row'):
        user_id = int(attr_dic["@Id"])
        # Optional attributes default to None when absent from the row.
        creation_date = attr_dic.get("@CreationDate")
        age = int(attr_dic["@Age"]) if "@Age" in attr_dic else None
        location = attr_dic.get("@Location")
        reputation = (int(attr_dic["@Reputation"])
                      if "@Reputation" in attr_dic else None)
        # BUG FIX: "@Views" was previously assigned to last_access_date,
        # leaving ``views`` always None and clobbering the access date.
        views = int(attr_dic["@Views"]) if "@Views" in attr_dic else None
        website_url = attr_dic.get("@WebsiteUrl")
        down_votes = (int(attr_dic["@DownVotes"])
                      if "@DownVotes" in attr_dic else None)
        up_votes = (int(attr_dic["@UpVotes"])
                    if "@UpVotes" in attr_dic else None)
        about_me = attr_dic.get("@AboutMe")
        last_access_date = attr_dic.get("@LastAccessDate")
        display_name = attr_dic.get("@DisplayName")
        lst_badges = map_user_badges.get(user_id)
        user = User(user_id, reputation, age, location, creation_date,
                    views, lst_badges, about_me, up_votes, down_votes,
                    website_url, last_access_date, display_name)
        self.map_of_user[user_id] = user
def __init__(self, xml_post_link_file_path):
    """Parse PostLinks.xml into duplicate/related post maps.

    LinkTypeId 3 marks a duplicate link, 1 a related link; any other
    link type is ignored.

    :param xml_post_link_file_path: path to the PostLinks.xml dump.
    """
    self.map_duplicate_posts = {}
    self.map_related_posts = {}
    for attr_dic in xmliter(xml_post_link_file_path, 'row'):
        post_id = int(attr_dic["@PostId"])
        related_post_id = int(attr_dic["@RelatedPostId"])
        link_type_id = int(attr_dic["@LinkTypeId"])
        # setdefault replaces the explicit membership test per row.
        if link_type_id == 3:  # Duplicate
            self.map_duplicate_posts.setdefault(post_id, []).append(
                related_post_id)
        elif link_type_id == 1:  # Related
            self.map_related_posts.setdefault(post_id, []).append(
                related_post_id)
def getLines(InFileName, outfileName, maxLines, startdate=None, enddate=None):
    """Export WordPress XML posts to a simple HTML file.

    Items of post_type 'post' are written as an <H2> title, a bold
    YYYY-MM-DD date, and the post body with blank lines converted to
    paragraph breaks.

    :param InFileName: WordPress export XML file to read.
    :param outfileName: HTML file to write (UTF-8).
    :param maxLines: maximum number of posts to emit, or None for all.
    :param startdate: exclusive lower bound "YYYY-MM-DD", or None.
    :param enddate: exclusive upper bound "YYYY-MM-DD", or None.
    """
    count = 0
    if maxLines is None:
        maxLines = -1  # sentinel: -1 means "no limit"
    DateStart = None
    DateEnd = None
    if startdate is not None:
        DateStart = datetime.datetime.strptime(startdate, "%Y-%m-%d")
    if enddate is not None:
        DateEnd = datetime.datetime.strptime(enddate, "%Y-%m-%d")
    # "with" guarantees the HTML file is closed even if parsing fails.
    with open(outfileName, 'w', encoding='utf8') as f:
        f.write("<HTML><BODY>\n")
        for d in xmliter(InFileName, 'item'):
            if maxLines != -1 and count >= maxLines:
                break  # limit reached; no need to keep parsing
            if d['{http://wordpress.org/export/1.2/}post_type'] != 'post':
                continue
            title = ""
            if d['title'] is not None:
                title = "<H2>" + d['title'] + "</h2>"
            if d['pubDate'] is not None:
                date_time_obj = datetime.datetime.strptime(
                    d['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
                # Drop tzinfo so it compares against the naive bounds.
                date_time_obj = date_time_obj.replace(tzinfo=None)
                # BUG FIX: the upper-bound test previously checked
                # DateStart instead of DateEnd, so supplying only a start
                # date raised "'>' not supported" (DateEnd was None).
                if ((DateStart is None or DateStart < date_time_obj)
                        and (DateEnd is None or DateEnd > date_time_obj)):
                    f.write(title + "\n")
                    dateStr = ('<b>'
                               + date_time_obj.date().strftime("%Y-%m-%d")
                               + '</b>')
                    f.write(dateStr + "\n")
                    itemString = "<p>" + d[
                        '{http://purl.org/rss/1.0/modules/content/}encoded'].replace(
                            '\n\n', '</p><p>\n') + '</p>'
                    f.write(itemString + "\n")
                    count = count + 1
        f.write('\n</BODY></HTML>')
def __init__(self, xml_vote_file_path):
    """Parse Votes.xml into ``self.map_of_votes`` (post id -> [Vote]).

    :param xml_vote_file_path: path to the Votes.xml dump.
    """
    self.map_of_votes = {}
    for attr_dic in xmliter(xml_vote_file_path, 'row'):
        # Renamed from ``id`` to avoid shadowing the builtin.
        vote_id = int(attr_dic["@Id"])
        post_id = int(attr_dic["@PostId"])
        vote_type_id = int(attr_dic["@VoteTypeId"])
        # Optional attributes default to None when absent from the row.
        user_id = int(attr_dic["@UserId"]) if "@UserId" in attr_dic else None
        bounty_amount = (int(attr_dic["@BountyAmount"])
                         if "@BountyAmount" in attr_dic else None)
        creation_date = attr_dic.get("@CreationDate")
        vote = Vote(vote_id, post_id, vote_type_id, user_id,
                    creation_date, bounty_amount)
        self.map_of_votes.setdefault(post_id, []).append(vote)
def __init__(self, xml_post_history_file_path):
    """Parse PostHistory.xml into ``self.map_of_edits``
    (post id -> [PostHistory]).

    :param xml_post_history_file_path: path to the PostHistory.xml dump.
    """
    self.map_of_edits = {}
    for attr_dic in xmliter(xml_post_history_file_path, 'row'):
        history_id = int(attr_dic["@Id"])
        post_id = int(attr_dic["@PostId"])
        # Optional attributes default to None when absent from the row.
        revision_guid = attr_dic.get("@RevisionGUID")
        post_history_type_id = (int(attr_dic["@PostHistoryTypeId"])
                                if "@PostHistoryTypeId" in attr_dic else None)
        comment = attr_dic.get("@Comment")
        user_display_name = attr_dic.get("@UserDisplayName")
        close_reason_id = (int(attr_dic["@CloseReasonId"])
                           if "@CloseReasonId" in attr_dic else None)
        user_id = int(attr_dic["@UserId"]) if "@UserId" in attr_dic else None
        creation_date = attr_dic.get("@CreationDate")
        text = attr_dic.get("@Text")
        post_history = PostHistory(history_id, post_id, post_history_type_id,
                                   revision_guid, creation_date, user_id,
                                   user_display_name, comment, text,
                                   close_reason_id)
        self.map_of_edits.setdefault(post_id, []).append(post_history)
def read_file(self, input_file):
    """Process a line-per-row Posts.xml dump, annotating each post body.

    Each input line (one <row .../> element) is wrapped in a minimal XML
    document, written to a temporary file, re-parsed with xmliter, then
    the post body is stripped of markup and tokenized/annotated.
    Answers are keyed as "<question id>_<answer id>".

    :param input_file: path to the line-per-row Posts.xml file.
    """
    line_to_add_begining_of_row = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + "\n" + "<posts>"
    line_to_add_ending_of_row = "</posts>"
    temp_xml = "temp_xml.xml"
    # BUG FIX: the input file and the per-line temp file were never
    # closed (one leaked handle per line); "with" closes both reliably.
    with open(input_file, 'r') as source:
        for line in source:
            xml_doc = (line_to_add_begining_of_row + line
                       + line_to_add_ending_of_row)
            with open(temp_xml, 'w') as f_temp:
                f_temp.write(xml_doc)
            for d in xmliter(temp_xml, 'row'):
                post_id = d['@Id'].encode("utf-8").strip()
                post_id = str(post_id.decode("utf-8"))
                post_type_id = d['@PostTypeId']
                if post_type_id == "2":
                    question_id = d['@ParentId'].encode(
                        "utf-8").strip().decode("utf-8")
                    print("now processing answer with : ", post_id,
                          ", from question with id: ", question_id)
                    post_id = question_id + "_" + post_id
                else:
                    print("now processing question with id: ", post_id)
                body = d['@Body']
                body_xml_filtered = self.Extract_Text_From_XML(body)
                annotated_tokenized_text = self.tokenize_and_annotae_post_body(
                    body_xml_filtered, post_id)
def __init__(self, xml_comment_file_path):
    """Parse Comments.xml into ``self.map_of_comments_for_post``
    (post id -> [Comment]).

    :param xml_comment_file_path: path to the Comments.xml dump.
    """
    self.map_of_comments_for_post = {}
    for attr_dic in xmliter(xml_comment_file_path, 'row'):
        comment_id = int(attr_dic["@Id"])
        post_id = int(attr_dic["@PostId"])
        text = attr_dic["@Text"]
        # Optional attributes default to None when absent from the row.
        score = int(attr_dic["@Score"]) if "@Score" in attr_dic else None
        user_id = int(attr_dic["@UserId"]) if "@UserId" in attr_dic else None
        creation_date = attr_dic.get("@CreationDate")
        comment = Comment(comment_id, post_id, text, score, user_id,
                          creation_date)
        self.map_of_comments_for_post.setdefault(post_id, []).append(comment)
def test_parsing_note_error(xmldata_note_error, parser):
    """Malformed XML must raise a parse error from whichever backend runs."""
    # BUG FIX: ``parsing_method`` belongs to xmliter, not pytest.raises —
    # passing it to pytest.raises raises a TypeError before the test body.
    with pytest.raises((ParseError, cParseError, XMLSyntaxError)):
        for doc in xmliter(xmldata_note_error, 'note', parsing_method=parser):
            pass
from xmlr import xmliter

# Convert the ARQMath 2021 task-2 topic file into a tab-separated
# "<topic id>\t<latex formula>" text file.
input_xml = 'topics.arqmath-2021-task2.origin.xml'
output_txt = 'topics.arqmath-2021-task2.txt'

with open(output_txt, 'w') as fh:
    for topic in xmliter(input_xml, 'Topic'):
        qid = topic['@number']
        latex = topic['Latex']
        print(f'{qid}\t{latex}', file=fh)
def read_euclid_mission_plan(data):
    """Read a Euclid mission-plan XML file into a per-pointing DataFrame.

    Streams <ObservationRequest> elements, flattens their nested
    <PointingRequest> entries, converts ecliptic attitude angles to ICRS
    RA/DEC, derives calendar year / day-of-year from each start time, and
    appends per-planet sky positions from the module-level planets list.

    :param data: path (or file object) accepted by ``xmliter``.
    :return: pandas DataFrame with one row per pointing request.
    """
    pointValue = []
    obsValue = []  # NOTE(review): never populated or returned — dead variable?
    i = 0
    print("Reading pointing requests")
    for d in tqdm(xmliter(data, "ObservationRequest")):
        # A request with a single pointing parses as a dict, not a list;
        # normalise so the loop below always sees a list.
        if not isinstance(d["PointingRequest"], list):
            d["PointingRequest"] = [d["PointingRequest"]]
        for i in d["PointingRequest"]:
            #print(i)
            # Copy the observation-level metadata down onto each pointing.
            i["ObservationType"] = d["ObservationType"]
            i["MissionPhase"] = d["MissionPhase"]
            i["SurveyId"] = d["SurveyId"]
        pointValue = pointValue + d["PointingRequest"]
    db = pd.DataFrame(pointValue)
    nrows = len(db.iloc[:, 0])
    # Pre-sized output frame; the columns are filled in below.
    db_small = pd.DataFrame(index=np.arange(nrows), columns=[
        'ID', 'MissionPhase', 'ObservationType', 'SurveyId', 'MJD2000',
        'StartTime', 'Year', 'Day_year', 'Lon', 'Lat', 'RA', 'DEC', 'PA',
        "exptime", "expstart"
    ])
    db_small["MissionPhase"] = db.iloc[:, ]["MissionPhase"]
    db_small["ObservationType"] = db.iloc[:, ]["ObservationType"]
    db_small["SurveyId"] = db.iloc[:, ]["SurveyId"]
    # Each row's "Attitude" is a nested dict; pull out its angular fields.
    db_small["Lon"] = np.array(
        [float(i["Longitude"]) for i in db.iloc[:, ]["Attitude"]])
    db_small["Lat"] = np.array(
        [float(i["Latitude"]) for i in db.iloc[:, ]["Attitude"]])
    db_small["PA"] = np.array(
        [float(i["PositionAngle"]) for i in db.iloc[:, ]["Attitude"]])
    print("Transforming coordinates...")
    # Ecliptic lon/lat -> SkyCoord; RA/DEC are extracted further down.
    gc = SkyCoord(lon=db_small["Lon"] * u.degree,
                  lat=db_small["Lat"] * u.degree,
                  frame='barycentrictrueecliptic')
    # assumes the first column of ``db`` is the pointing ID — TODO confirm
    db_small["ID"] = db.iloc[:, 0]
    db_small["MJD2000"] = np.array(db.iloc[:, ]["Mjd2000"]).astype("float")
    db_small["StartTime"] = db.iloc[:, ]["StartTime"]
    # t = [Time(db_small["StartTime"], format='isot', scale='utc') for
    print("Time dates reshaping...")
    # One astropy Time per pointing, parsed from the ISO start timestamp.
    t = [Time(i, format='isot', scale='utc') for i in db_small["StartTime"]]
    db_small["Year"] = np.array([i.datetime.year for i in t])
    # tt = t.datetime.timetuple()
    # We use tm_yday transforming t to tt (tuple time)
    tt = [i.datetime.timetuple() for i in t]
    db_small["Day_year"] = np.array([i.tm_yday for i in tt])
    db_small["RA"] = gc.icrs.ra.degree
    db_small["DEC"] = gc.icrs.dec.degree
    db_small["exptime"] = np.array(db.iloc[:, ]["Duration"]).astype("float")
    db_small["expstart"] = t
    # Now we add the planets positions in the sky.
    planets_position = [read_ephemerides(i) for i in planets_list]
    for i in range(len(planets_list)):
        print(planets_list[i])
        ra_temp, dec_temp = position_planet(ephemeris=planets_position[i],
                                            time=db_small["expstart"][:],
                                            time_zero=db_small["expstart"][0])
        db_small["ra_" + planets_list[i].lower()] = ra_temp
        db_small["dec_" + planets_list[i].lower()] = dec_temp
    print("End of line")
    return (db_small)
import pandas as pd from detectron2.structures import BoxMode from xmlr import xmlparse from xmlr import xmliter from xmlr import xmliter, XMLParsingMethods import xml.etree.ElementTree img_dir = "/mnt/dst_datasets/own_omni_dataset/theodore_v3/images/" import cv2 from matplotlib import pyplot as plt count = 0 df_cols = ["id", "name", 'xtl', 'ytl', 'xbr', 'ybr', "action_label", "grp_id"] rows = [] for d in xmliter( '/mnt/dst_datasets/own_omni_dataset/theodore_v3/theodore_plus_training.xml', 'image'): if count == 45000: record = {} boxes = [] grp = [] grp_id = [] actions = [] for k, v in d.items(): if k == 'actions': for key, val in v.items(): for a in val: actions.append(a["@name"]) grp_id.append(a["@group_id"]) if k == 'box': if type(v) is list:
def __init__(self, xml_post_link_file_path, map_comments=None,
             map_related_post=None, map_duplicate_post=None, map_votes=None,
             map_users=None, post_history_parser=None):
    """Parse Posts.xml into question/answer maps.

    Builds ``self.map_questions`` (question id -> Question),
    ``self.map_answers`` (question id -> [Answer]) and
    ``self.map_just_answers`` (answer id -> Answer), attaching comments,
    votes, edit history, related/duplicate links and owning users from
    the optional auxiliary maps.

    :param xml_post_link_file_path: path to the Posts.xml dump.
    :param map_comments: post id -> [Comment], or None.
    :param map_related_post: post id -> [related post id], or None.
    :param map_duplicate_post: post id -> [duplicate post id], or None.
    :param map_votes: post id -> [Vote], or None.
    :param map_users: user id -> User, or None.
    :param post_history_parser: parser exposing ``map_of_edits``, or None.
    """
    self.map_questions = {}
    self.map_answers = {}
    self.map_just_answers = {}
    for attr_dic in xmliter(xml_post_link_file_path, 'row'):
        post_id = int(attr_dic['@Id'])
        post_type_id = int(attr_dic['@PostTypeId'])
        creation_date = attr_dic["@CreationDate"]
        body = attr_dic["@Body"]
        # Attributes shared by questions and answers; None when absent.
        view_count = None
        comment_count = None
        owner_user_id = None
        last_edit_date = None
        last_activity_date = None
        last_editor_user_id = None
        community_owned_date = None
        last_editor_display_name = None
        score = None
        user = None
        if "@ViewCount" in attr_dic:
            view_count = int(attr_dic["@ViewCount"])
        if "@Score" in attr_dic:
            score = int(attr_dic["@Score"])
        if "@CommentCount" in attr_dic:
            comment_count = int(attr_dic["@CommentCount"])
        if "@OwnerUserId" in attr_dic:
            owner_user_id = int(attr_dic["@OwnerUserId"])
            if map_users is not None and owner_user_id in map_users:
                user = map_users[owner_user_id]
        if "@LastEditDate" in attr_dic:
            last_edit_date = attr_dic["@LastEditDate"]
        if "@LastActivityDate" in attr_dic:
            last_activity_date = attr_dic["@LastActivityDate"]
        if "@LastEditorUserId" in attr_dic:
            last_editor_user_id = int(attr_dic["@LastEditorUserId"])
        if "@CommunityOwnedDate" in attr_dic:
            community_owned_date = attr_dic["@CommunityOwnedDate"]
        if "@LastEditorDisplayName" in attr_dic:
            last_editor_display_name = attr_dic["@LastEditorDisplayName"]
        comment_list = None
        vote_list = None
        edit_list = None
        if map_comments is not None and post_id in map_comments:
            comment_list = map_comments[post_id]
        if map_votes is not None and post_id in map_votes:
            vote_list = map_votes[post_id]
        if (post_history_parser is not None
                and post_id in post_history_parser.map_of_edits):
            edit_list = post_history_parser.map_of_edits[post_id]
        if post_type_id == 1:  # Question
            title = attr_dic["@Title"]
            favourite_count = None
            closed_date = None
            accepted_answer_id = None
            # BUG FIX: answer_count and lst_tags were only assigned inside
            # their "@... in attr_dic" guards, raising NameError (or
            # silently reusing a previous row's value) when the attribute
            # was missing from a row.
            answer_count = None
            lst_tags = []
            # Related posts are (id, is_duplicate) pairs.
            related_post = []
            if map_related_post is not None and post_id in map_related_post:
                for related_post_id in map_related_post[post_id]:
                    related_post.append((related_post_id, False))
            if map_duplicate_post is not None and post_id in map_duplicate_post:
                for related_post_id in map_duplicate_post[post_id]:
                    related_post.append((related_post_id, True))
            if "@AnswerCount" in attr_dic:
                answer_count = int(attr_dic["@AnswerCount"])
            if "@FavoriteCount" in attr_dic:
                favourite_count = int(attr_dic["@FavoriteCount"])
            if "@AcceptedAnswerId" in attr_dic:
                accepted_answer_id = int(attr_dic["@AcceptedAnswerId"])
            if "@ClosedDate" in attr_dic:
                closed_date = attr_dic["@ClosedDate"]
            if "@Tags" in attr_dic:
                # Tags arrive as "<a><b><c>"; strip the angle brackets.
                tags = attr_dic["@Tags"].split(">")
                for i in range(0, len(tags) - 1):
                    lst_tags.append(tags[i][1:])
            self.map_questions[post_id] = Question(
                post_id, creation_date, score, view_count, body,
                owner_user_id, comment_count, last_edit_date,
                last_activity_date, last_editor_user_id,
                community_owned_date, last_editor_display_name,
                related_post, comment_list, vote_list, edit_list, user,
                title, lst_tags, accepted_answer_id, answer_count,
                favourite_count, closed_date)
        elif post_type_id == 2:  # Answer
            parent_id = int(attr_dic["@ParentId"])
            answer = Answer(post_id, creation_date, score, view_count,
                            body, owner_user_id, comment_count,
                            last_edit_date, last_activity_date,
                            last_editor_user_id, community_owned_date,
                            last_editor_display_name, parent_id,
                            comment_list, vote_list, edit_list, user)
            self.map_answers.setdefault(parent_id, []).append(answer)
            self.map_just_answers[answer.post_id] = answer
    # Link collected answers back onto their questions.
    self.__set_answers()
# Benchmark the memory footprint of xmlr.xmlparse / xmlr.xmliter with
# each available parsing backend against a large sample document.
print('xmlr.xmlparse using xml.etree.cElementTree')
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
               XMLParsingMethods.C_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc)/1024./1024.))
del doc

print('xmlr.xmlparse using lxml.etree')
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
               XMLParsingMethods.LXML_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc)/1024./1024.))
del doc

# xmliter
print('xmlr.xmliter using xml.etree.ElementTree')
docs = list(xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                    "Record", XMLParsingMethods.ELEMENTTREE))
print('Size in MB: {0:.2f} MB'.format(document_size(docs)/1024./1024.))
del docs

print('xmlr.xmliter using xml.etree.cElementTree')
docs = list(xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                    "Record", XMLParsingMethods.C_ELEMENTTREE))
print('Size in MB: {0:.2f} MB'.format(document_size(docs)/1024./1024.))
del docs

print('xmlr.xmliter using lxml.etree')
docs = list(xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                    "Record", XMLParsingMethods.LXML_ELEMENTTREE))
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml", XMLParsingMethods.C_ELEMENTTREE) print('Size in MB: {0:.2f} MB'.format(document_size(doc) / 1024. / 1024.)) del doc print('xmlr.xmlparse using lxml.etree') doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml", XMLParsingMethods.LXML_ELEMENTTREE) print('Size in MB: {0:.2f} MB'.format(document_size(doc) / 1024. / 1024.)) del doc # xmliter print('xmlr.xmliter using xml.etree.ElementTree') docs = [] for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml", "Record", XMLParsingMethods.ELEMENTTREE): docs.append(d) print('Size in MB: {0:.2f} MB'.format(document_size(docs) / 1024. / 1024.)) del docs print('xmlr.xmliter using xml.etree.cElementTree') docs = [] for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml", "Record", XMLParsingMethods.C_ELEMENTTREE): docs.append(d) print('Size in MB: {0:.2f} MB'.format(document_size(docs) / 1024. / 1024.)) del docs print('xmlr.xmliter using lxml.etree') docs = [] for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
# Target Mongo database/collections for the fdac18 StackOverflow dump.
sodatadb = activeclient.fdac18stackoverflow
tagcol = sodatadb.tags
postcol = sodatadb.posts
commentcol = sodatadb.comments

# Only posts carrying at least one of these front-end framework tags
# are imported.
neededtags = [
    'reactjs', 'angularjs', 'vue.js', 'vuejs2', 'ember.js', 'jquery',
    'backbone.js'
]
neededtagset = set(neededtags)

# Locations of the raw StackOverflow XML dumps.
xmldir = '/data/NPMDependencies/stackoverflowdata/'
tagfile = xmldir + 'Tags.xml'
postfile = xmldir + 'Posts.xml'
commentfile = xmldir + 'Comments.xml'

# Tags.xml is parsed in full (untangle loads the whole document).
tagXML = untangle.parse(tagfile)
for tag in tagXML.tags.children:
    mongotag = TagDocument(tag)
    tagcol.insert_one(mongotag.insertable)

# Posts.xml is streamed row-by-row; keep only posts sharing at least
# one tag with the needed set.
for post in xmliter(postfile, "row"):
    mongopost = PostDocument(post)
    posttags = set(mongopost.insertable['Tags'])
    if neededtagset.intersection(posttags):
        postcol.insert_one(mongopost.insertable)

# NOTE(review): comments are imported unfiltered, including comments on
# posts skipped above — confirm this is intended.
for comment in xmliter(commentfile, "row"):
    mongocomment = CommentDocument(comment)
    commentcol.insert_one(mongocomment.insertable)