Example 1
def read_irsa_query(infile):
    lstValue = []

    year = [i for i in xmliter(infile, "year")][0]
    day = [i for i in xmliter(infile, "day")][0]

    for d in tqdm(xmliter(infile, "statistics")):
        # Strip the "(MJy/sr)" unit suffix and convert each background
        # component to a float.
        d['zody'] = float(d['zody'].replace("(MJy/sr)", ""))
        d['cib'] = float(d['cib'].replace("(MJy/sr)", ""))
        d['stars'] = float(d['stars'].replace("(MJy/sr)", ""))
        d['ism'] = float(d['ism'].replace("(MJy/sr)", ""))
        d['totbg'] = float(d['totbg'].replace("(MJy/sr)", ""))

        # Split the reference coordinate into RA/Dec and drop the raw field.
        d['ra'] = float(d['refCoordinate'].split(" ")[0])
        d['dec'] = float(d['refCoordinate'].split(" ")[1])
        d.pop("refCoordinate")
        d['year'] = year
        d['day'] = day
        lstValue.append(d)

    return lstValue
Example 2
def test_parsing_note(xmldata_note, parser):
    docs = []
    for doc in xmliter(xmldata_note, 'note', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs)
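The `parser` argument in these tests is a pytest fixture that is not part of the snippet. A minimal sketch of how such a fixture could be parametrized over the parsing backends that xmlr exposes (this fixture is an assumption for illustration, not the project's actual conftest):

import pytest
from xmlr import XMLParsingMethods


@pytest.fixture(params=[XMLParsingMethods.ELEMENTTREE,
                        XMLParsingMethods.C_ELEMENTTREE,
                        XMLParsingMethods.LXML_ELEMENTTREE])
def parser(request):
    # Each test that takes `parser` runs once per parsing backend.
    return request.param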
Example 4
def test_parsing_plants(xmldata_plants, parser):
    docs = []
    for doc in xmliter(xmldata_plants, 'PLANT', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs) == 36
Example 5
def test_parsing_menu(xmldata_menu, parser):
    docs = []
    for doc in xmliter(xmldata_menu, 'food', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs) == 5
Example 7
def test_parsing_cd(xmldata_cd, parser):
    docs = []
    for doc in xmliter(xmldata_cd, 'CD', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs) == 26
Example 10
 def parse(cls, file_path):
     """
     generator parser
     :param file_path: path to xml file
     :return: yields artist items
     """
     for data in xmliter(file_path, 'artist'):
         yield cls(data)
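The docstring above describes `parse` as a generator: it yields one wrapped item per <artist> element instead of loading the whole file into memory. A minimal, self-contained sketch of the pattern, with a hypothetical `Artist` wrapper class assumed for illustration:

from xmlr import xmliter


class Artist:
    """Hypothetical wrapper around a single <artist> record."""

    def __init__(self, data):
        # `data` is the dict that xmliter builds for one <artist> element.
        self.data = data

    @classmethod
    def parse(cls, file_path):
        # Lazily yield one Artist per <artist> element.
        for data in xmliter(file_path, 'artist'):
            yield cls(data)


# Usage: stream a large dump without materialising it as a list.
# for artist in Artist.parse('artists.xml'):
#     process(artist.data)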
Example 11
def test_parsing_test_doc(parser):
    f = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'test_doc.xml')
    docs = []
    for doc in xmliter(f, 'AnItem', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs) == 3
Example 13
    def parse(cls, file_path):
        """
        generator parser
        :param file_path: path to xml file
        :return: yields Release items
        """

        for data in xmliter(file_path, 'release'):
            yield cls(data)
Example 14
def test_parsing_google_renewal_data_1(parser):
    f = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'google-renewals-subset-20080624.xml')
    docs = []
    for doc in xmliter(f, 'Record', parsing_method=parser):
        assert isinstance(doc, dict)
        docs.append(doc)
        walk_test(doc)
    assert len(docs) == 4
Example 16
 def read_badges(xml_badges_file_path):
     map_user_badges = {}
     if xml_badges_file_path is not None:
         for attr_dic in xmliter(xml_badges_file_path, 'row'):
             user_id = int(attr_dic["@UserId"])
             class_type = int((attr_dic["@Class"][0]))
             date = (attr_dic["@Date"])
             if user_id in map_user_badges:
                 map_user_badges[user_id].append((class_type, date))
             else:
                 map_user_badges[user_id] = [(class_type, date)]
     return map_user_badges
Example 17
    def __init__(self, xml_user_file_path, xml_badges_file_path):
        self.map_of_user = {}
        map_user_badges = UserParserRecord.read_badges(xml_badges_file_path)

        for attr_dic in xmliter(xml_user_file_path, 'row'):
            user_id = int(attr_dic["@Id"])
            creation_date = None
            age = None
            location = None
            reputation = None
            views = None
            about_me = None
            up_votes = None
            down_votes = None
            website_url = None
            last_access_date = None
            display_name = None

            if "@CreationDate" in attr_dic:
                creation_date = (attr_dic["@CreationDate"])
            if "@Age" in attr_dic:
                age = int(attr_dic["@Age"])
            if "@Location" in attr_dic:
                location = (attr_dic["@Location"])
            if "@Reputation" in attr_dic:
                reputation = int(attr_dic["@Reputation"])
            if "@Views" in attr_dic:
                views = int(attr_dic["@Views"])
            if "@WebsiteUrl" in attr_dic:
                website_url = (attr_dic["@WebsiteUrl"])
            if "@DownVotes" in attr_dic:
                down_votes = int(attr_dic["@DownVotes"])
            if "@UpVotes" in attr_dic:
                up_votes = int(attr_dic["@UpVotes"])
            if "@AboutMe" in attr_dic:
                about_me = (attr_dic["@AboutMe"])
            if "@LastAccessDate" in attr_dic:
                last_access_date = (attr_dic["@LastAccessDate"])
            if "@DisplayName" in attr_dic:
                display_name = (attr_dic["@DisplayName"])
            lst_badges = None
            if user_id in map_user_badges:
                lst_badges = map_user_badges[user_id]

            user = User(user_id, reputation, age, location, creation_date,
                        views, lst_badges, about_me, up_votes, down_votes,
                        website_url, last_access_date, display_name)
            self.map_of_user[user_id] = user
Example 18
    def __init__(self, xml_post_link_file_path):
        self.map_duplicate_posts = {}
        self.map_related_posts = {}
        for attr_dic in xmliter(xml_post_link_file_path, 'row'):
            post_id = int(attr_dic["@PostId"])
            related_post_id = int(attr_dic["@RelatedPostId"])
            link_type_id = int(attr_dic["@LinkTypeId"])

            if link_type_id == 3:  # Duplicate
                if post_id in self.map_duplicate_posts:
                    self.map_duplicate_posts[post_id].append(related_post_id)
                else:
                    self.map_duplicate_posts[post_id] = [related_post_id]
            elif link_type_id == 1:  # Related
                if post_id in self.map_related_posts:
                    self.map_related_posts[post_id].append(related_post_id)
                else:
                    self.map_related_posts[post_id] = [related_post_id]
Example 19
def getLines(InFileName, outfileName, maxLines, startdate=None, enddate=None):

    f = open(outfileName, 'w', encoding='utf8')
    f.write("<HTML><BODY>\n")

    count = 0
    if maxLines == None:
        maxLines = -1

    DateStart = None
    DateEnd = None
    if (startdate != None):
        DateStart = datetime.datetime.strptime(startdate, "%Y-%m-%d")
    if (enddate != None):
        DateEnd = datetime.datetime.strptime(enddate, "%Y-%m-%d")

    for d in xmliter(InFileName, 'item'):
        if maxLines == -1 or count < maxLines:
            if (d['{http://wordpress.org/export/1.2/}post_type'] == 'post'):
                if (d['title'] != None):
                    title = "<H2>" + d['title'] + "</h2>"
                    if (d['pubDate'] != None):

                        date_time_obj = datetime.datetime.strptime(
                            d['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
                        date_time_obj = date_time_obj.replace(tzinfo=None)
                        if (DateStart is None) or (DateStart < date_time_obj):
                            if (DateEnd is None) or (DateEnd > date_time_obj):
                                f.write(title + "\n")
                                dateStr = '<b>' + date_time_obj.date(
                                ).strftime("%Y-%m-%d") + '</b>'
                                f.write(dateStr + "\n")
                                itemString = "<p>" + d[
                                    '{http://purl.org/rss/1.0/modules/content/}encoded'].replace(
                                        '\n\n', '</p><p>\n') + '</p>'
                                f.write(itemString + "\n")
                                count = count + 1

    f.write('\n</BODY></HTML>')
    f.close()
Example 20
 def __init__(self, xml_vote_file_path):
     self.map_of_votes = {}
     for attr_dic in xmliter(xml_vote_file_path, 'row'):
         id = int(attr_dic["@Id"])
         post_id = int(attr_dic["@PostId"])
         vote_type_id = int(attr_dic["@VoteTypeId"])
         user_id = None
         bounty_amount = None
         creation_date = None
         if "@UserId" in attr_dic:
             user_id = int(attr_dic["@UserId"])
         if "@BountyAmount" in attr_dic:
             bounty_amount = int(attr_dic["@BountyAmount"])
         if "@CreationDate" in attr_dic:
             creation_date = attr_dic["@CreationDate"]
         vote = Vote(id, post_id, vote_type_id, user_id, creation_date, bounty_amount)
         if post_id in self.map_of_votes:
             self.map_of_votes[post_id].append(vote)
         else:
             self.map_of_votes[post_id] = [vote]
Example 21
 def __init__(self, xml_post_history_file_path):
     self.map_of_edits = {}
     for attr_dic in xmliter(xml_post_history_file_path, 'row'):
         history_id = int(attr_dic["@Id"])
         post_id = int(attr_dic["@PostId"])
         post_history_type_id = None
         revision_guid = None
         user_display_name = None
         text = None
         creation_date = None
         comment = None
         user_id = None
         close_reason_id = None
         if "@RevisionGUID" in attr_dic:
             revision_guid = attr_dic["@RevisionGUID"]
         if "@PostHistoryTypeId" in attr_dic:
             post_history_type_id = int(attr_dic["@PostHistoryTypeId"])
         if "@Comment" in attr_dic:
             comment = (attr_dic["@Comment"])
         if "@UserDisplayName" in attr_dic:
             user_display_name = (attr_dic["@UserDisplayName"])
         if "@CloseReasonId" in attr_dic:
             close_reason_id = int(attr_dic["@CloseReasonId"])
         if "@UserId" in attr_dic:
             user_id = int(attr_dic["@UserId"])
         if "@CreationDate" in attr_dic:
             creation_date = (attr_dic["@CreationDate"])
         if "@Text" in attr_dic:
             text = (attr_dic["@Text"])
         post_history = PostHistory(history_id, post_id,
                                    post_history_type_id, revision_guid,
                                    creation_date, user_id,
                                    user_display_name, comment, text,
                                    close_reason_id)
         if post_id in self.map_of_edits:
             self.map_of_edits[post_id].append(post_history)
         else:
             self.map_of_edits[post_id] = [post_history]
Example 22
    def read_file(self, input_file):

        line_to_add_begining_of_row = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + "\n" + "<posts>"
        line_to_add_ending_of_row = "</posts>"
        #print question_input_file

        for line in open(input_file, 'r'):
            # print(line)
            xml_doc = line_to_add_begining_of_row + line + line_to_add_ending_of_row
            temp_xml = "temp_xml.xml"
            f_temp = open(temp_xml, 'w')
            f_temp.write(xml_doc)
            f_temp.flush()
            f_temp.close()

            for d in xmliter(temp_xml, 'row'):
                post_id = d['@Id'].encode("utf-8").strip()
                post_id = str(post_id.decode("utf-8"))

                post_type_id = d['@PostTypeId']

                if post_type_id == "2":
                    question_id = d['@ParentId'].encode(
                        "utf-8").strip().decode("utf-8")
                    # print(question_id)

                    print("now processing answer with : ", post_id,
                          ", from question with id: ", question_id)
                    post_id = question_id + "_" + post_id

                else:
                    print("now processing question with id: ", post_id)

                body = d['@Body']
                body_xml_filtered = self.Extract_Text_From_XML(body)

                annotated_tokenized_text = self.tokenize_and_annotae_post_body(
                    body_xml_filtered, post_id)
Example 23
    def __init__(self, xml_comment_file_path):

        self.map_of_comments_for_post = {}
        for attr_dic in xmliter(xml_comment_file_path, 'row'):
            comment_id = int(attr_dic["@Id"])
            post_id = int(attr_dic["@PostId"])
            text = (attr_dic["@Text"])
            creation_date = None
            score = None
            user_id = None

            if "@Score" in attr_dic:
                score = int(attr_dic["@Score"])
            if "@UserId" in attr_dic:
                user_id = int(attr_dic["@UserId"])
            if "@CreationDate" in attr_dic:
                creation_date = (attr_dic["@CreationDate"])

            comment = Comment(comment_id, post_id, text, score, user_id,
                              creation_date)
            if post_id in self.map_of_comments_for_post:
                self.map_of_comments_for_post[post_id].append(comment)
            else:
                self.map_of_comments_for_post[post_id] = [comment]
Example 24
def test_parsing_note_error(xmldata_note_error, parser):
    with pytest.raises((ParseError, cParseError, XMLSyntaxError)):
        for doc in xmliter(xmldata_note_error, 'note', parsing_method=parser):
            pass
Example 25
from xmlr import xmliter

input_xml = 'topics.arqmath-2021-task2.origin.xml'
output_txt = 'topics.arqmath-2021-task2.txt'

with open(output_txt, 'w') as fh:
    for attrs in xmliter(input_xml, 'Topic'):
        qid = attrs['@number']
        latex = attrs['Latex']
        print(f'{qid}\t{latex}', file=fh)
Example 27
def read_euclid_mission_plan(data):
    pointValue = []

    print("Reading pointing requests")
    for d in tqdm(xmliter(data, "ObservationRequest")):

        if not isinstance(d["PointingRequest"], list):
            d["PointingRequest"] = [d["PointingRequest"]]

        for i in d["PointingRequest"]:
            #print(i)
            i["ObservationType"] = d["ObservationType"]
            i["MissionPhase"] = d["MissionPhase"]
            i["SurveyId"] = d["SurveyId"]

        pointValue = pointValue + d["PointingRequest"]
    db = pd.DataFrame(pointValue)

    nrows = len(db.iloc[:, 0])
    db_small = pd.DataFrame(index=np.arange(nrows),
                            columns=[
                                'ID', 'MissionPhase', 'ObservationType',
                                'SurveyId', 'MJD2000', 'StartTime', 'Year',
                                'Day_year', 'Lon', 'Lat', 'RA', 'DEC', 'PA',
                                "exptime", "expstart"
                            ])

    db_small["MissionPhase"] = db.iloc[:, ]["MissionPhase"]
    db_small["ObservationType"] = db.iloc[:, ]["ObservationType"]
    db_small["SurveyId"] = db.iloc[:, ]["SurveyId"]

    db_small["Lon"] = np.array(
        [float(i["Longitude"]) for i in db.iloc[:, ]["Attitude"]])
    db_small["Lat"] = np.array(
        [float(i["Latitude"]) for i in db.iloc[:, ]["Attitude"]])
    db_small["PA"] = np.array(
        [float(i["PositionAngle"]) for i in db.iloc[:, ]["Attitude"]])

    print("Transforming coordinates...")
    gc = SkyCoord(lon=db_small["Lon"] * u.degree,
                  lat=db_small["Lat"] * u.degree,
                  frame='barycentrictrueecliptic')

    db_small["ID"] = db.iloc[:, 0]
    db_small["MJD2000"] = np.array(db.iloc[:, ]["Mjd2000"]).astype("float")
    db_small["StartTime"] = db.iloc[:, ]["StartTime"]

    #    t = [Time(db_small["StartTime"], format='isot', scale='utc') for
    print("Time dates reshaping...")
    t = [Time(i, format='isot', scale='utc') for i in db_small["StartTime"]]
    db_small["Year"] = np.array([i.datetime.year for i in t])
    #    tt = t.datetime.timetuple() # We use tm_yday transforming t to tt (tuple time)
    tt = [i.datetime.timetuple() for i in t]
    db_small["Day_year"] = np.array([i.tm_yday for i in tt])
    db_small["RA"] = gc.icrs.ra.degree
    db_small["DEC"] = gc.icrs.dec.degree
    db_small["exptime"] = np.array(db.iloc[:, ]["Duration"]).astype("float")
    db_small["expstart"] = t

    # Now we add the planets positions in the sky.
    planets_position = [read_ephemerides(i) for i in planets_list]
    for i in range(len(planets_list)):
        print(planets_list[i])
        ra_temp, dec_temp = position_planet(ephemeris=planets_position[i],
                                            time=db_small["expstart"][:],
                                            time_zero=db_small["expstart"][0])
        db_small["ra_" + planets_list[i].lower()] = ra_temp
        db_small["dec_" + planets_list[i].lower()] = dec_temp

    print("End of line")
    return (db_small)
Example 28
import pandas as pd
from detectron2.structures import BoxMode
from xmlr import xmlparse, xmliter, XMLParsingMethods
import xml.etree.ElementTree
import cv2
from matplotlib import pyplot as plt

img_dir = "/mnt/dst_datasets/own_omni_dataset/theodore_v3/images/"
count = 0

df_cols = ["id", "name", 'xtl', 'ytl', 'xbr', 'ybr', "action_label", "grp_id"]
rows = []
for d in xmliter(
        '/mnt/dst_datasets/own_omni_dataset/theodore_v3/theodore_plus_training.xml',
        'image'):
    if count == 45000:
        record = {}
        boxes = []
        grp = []
        grp_id = []
        actions = []
        for k, v in d.items():
            if k == 'actions':
                for key, val in v.items():
                    for a in val:
                        actions.append(a["@name"])
                        grp_id.append(a["@group_id"])
            if k == 'box':
                if type(v) is list:
Example 29
    def __init__(self,
                 xml_post_link_file_path,
                 map_comments=None,
                 map_related_post=None,
                 map_duplicate_post=None,
                 map_votes=None,
                 map_users=None,
                 post_history_parser=None):
        self.map_questions = {}
        self.map_answers = {}
        self.map_just_answers = {}
        for attr_dic in xmliter(xml_post_link_file_path, 'row'):
            post_id = int(attr_dic['@Id'])
            post_type_id = int(attr_dic['@PostTypeId'])
            creation_date = (attr_dic["@CreationDate"])
            body = (attr_dic["@Body"])
            view_count = None
            comment_count = None
            owner_user_id = None
            last_edit_date = None
            last_activity_date = None
            last_editor_user_id = None
            community_owned_date = None
            last_editor_display_name = None
            score = None
            user = None

            if "@ViewCount" in attr_dic:
                view_count = int(attr_dic["@ViewCount"])
            if "@Score" in attr_dic:
                score = int(attr_dic["@Score"])
            if "@CommentCount" in attr_dic:
                comment_count = int(attr_dic["@CommentCount"])
            if "@OwnerUserId" in attr_dic:
                owner_user_id = int(attr_dic["@OwnerUserId"])
                if map_users is not None and owner_user_id in map_users:
                    user = map_users[owner_user_id]
            if "@LastEditDate" in attr_dic:
                last_edit_date = (attr_dic["@LastEditDate"])
            if "@LastActivityDate" in attr_dic:
                last_activity_date = (attr_dic["@LastActivityDate"])
            if "@LastEditorUserId" in attr_dic:
                last_editor_user_id = int(attr_dic["@LastEditorUserId"])
            if "@CommunityOwnedDate" in attr_dic:
                community_owned_date = (attr_dic["@CommunityOwnedDate"])
            if "@LastEditorDisplayName" in attr_dic:
                last_editor_display_name = (attr_dic["@LastEditorDisplayName"])

            comment_list = None
            vote_list = None
            edit_list = None
            if map_comments is not None and post_id in map_comments:
                comment_list = map_comments[post_id]
            if map_votes is not None and post_id in map_votes:
                vote_list = map_votes[post_id]
            if post_history_parser is not None and post_id in post_history_parser.map_of_edits:
                edit_list = post_history_parser.map_of_edits[post_id]

            if post_type_id == 1:  # Question
                title = (attr_dic["@Title"])
                favourite_count = None
                closed_date = None
                accepted_answer_id = None
                answer_count = None
                lst_tags = []
                related_post = []

                if map_related_post is not None and post_id in map_related_post:
                    for related_post_id in map_related_post[post_id]:
                        related_post.append((related_post_id, False))

                if map_duplicate_post is not None and post_id in map_duplicate_post:
                    for related_post_id in map_duplicate_post[post_id]:
                        related_post.append((related_post_id, True))

                if "@CommentCount" in attr_dic:
                    comment_count = int(attr_dic["@CommentCount"])
                if "@AnswerCount" in attr_dic:
                    answer_count = int(attr_dic["@AnswerCount"])
                if "@FavoriteCount" in attr_dic:
                    favourite_count = int(attr_dic["@FavoriteCount"])
                if "@AcceptedAnswerId" in attr_dic:
                    accepted_answer_id = int(attr_dic["@AcceptedAnswerId"])
                if "@ClosedDate" in attr_dic:
                    closed_date = (attr_dic["@ClosedDate"])
                if "@Tags" in attr_dic:
                    tags = (attr_dic["@Tags"]).split(">")
                    lst_tags = []
                    for i in range(0, len(tags) - 1):
                        tag = tags[i][1:]
                        lst_tags.append(tag)
                self.map_questions[post_id] = Question(
                    post_id, creation_date, score, view_count, body,
                    owner_user_id, comment_count, last_edit_date,
                    last_activity_date, last_editor_user_id,
                    community_owned_date, last_editor_display_name,
                    related_post, comment_list, vote_list, edit_list, user,
                    title, lst_tags, accepted_answer_id, answer_count,
                    favourite_count, closed_date)

            elif post_type_id == 2:
                parent_id = int(attr_dic["@ParentId"])
                answer = Answer(post_id, creation_date, score, view_count,
                                body, owner_user_id, comment_count,
                                last_edit_date, last_activity_date,
                                last_editor_user_id, community_owned_date,
                                last_editor_display_name, parent_id,
                                comment_list, vote_list, edit_list, user)

                if parent_id in self.map_answers:
                    self.map_answers[parent_id].append(answer)
                else:
                    self.map_answers[parent_id] = [answer]
                self.map_just_answers[answer.post_id] = answer
        self.__set_answers()
Example 30
print ('xmlr.xmlparse using xml.etree.cElementTree')
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml", XMLParsingMethods.C_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc)/1024./1024.))
del doc

print ('xmlr.xmlparse using lxml.etree')
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml", XMLParsingMethods.LXML_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc)/1024./1024.))
del doc

# xmliter

print ('xmlr.xmliter using xml.etree.ElementTree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml", "Record", XMLParsingMethods.ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs)/1024./1024.))
del docs

print ('xmlr.xmliter using xml.etree.cElementTree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml", "Record", XMLParsingMethods.C_ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs)/1024./1024.))
del docs

print ('xmlr.xmliter using lxml.etree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml", "Record", XMLParsingMethods.LXML_ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs)/1024./1024.))
del docs
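The benchmark above relies on a `document_size` helper that is not shown. A rough sketch of what such a helper might look like, assuming a recursive `sys.getsizeof` walk over the parsed dicts and lists (the script's real helper may be implemented differently):

import sys


def document_size(obj):
    # Rough recursive size estimate for nested dicts/lists of parsed values.
    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(document_size(k) + document_size(v) for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set)):
        size += sum(document_size(item) for item in obj)
    return size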
Example 31
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
               XMLParsingMethods.C_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc) / 1024. / 1024.))
del doc

print('xmlr.xmlparse using lxml.etree')
doc = xmlparse("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
               XMLParsingMethods.LXML_ELEMENTTREE)
print('Size in MB: {0:.2f} MB'.format(document_size(doc) / 1024. / 1024.))
del doc

# xmliter

print('xmlr.xmliter using xml.etree.ElementTree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                 "Record", XMLParsingMethods.ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs) / 1024. / 1024.))
del docs

print('xmlr.xmliter using xml.etree.cElementTree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                 "Record", XMLParsingMethods.C_ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs) / 1024. / 1024.))
del docs

print('xmlr.xmliter using lxml.etree')
docs = []
for d in xmliter("/home/hbldh/Downloads/google-renewals-all-20080624.xml",
                 "Record", XMLParsingMethods.LXML_ELEMENTTREE):
    docs.append(d)
print('Size in MB: {0:.2f} MB'.format(document_size(docs) / 1024. / 1024.))
del docs
Example 32
sodatadb = activeclient.fdac18stackoverflow
tagcol = sodatadb.tags
postcol = sodatadb.posts
commentcol = sodatadb.comments

neededtags = [
    'reactjs', 'angularjs', 'vue.js', 'vuejs2', 'ember.js', 'jquery',
    'backbone.js'
]
neededtagset = set(neededtags)

xmldir = '/data/NPMDependencies/stackoverflowdata/'
tagfile = xmldir + 'Tags.xml'
postfile = xmldir + 'Posts.xml'
commentfile = xmldir + 'Comments.xml'

tagXML = untangle.parse(tagfile)
for tag in tagXML.tags.children:
    mongotag = TagDocument(tag)
    tagcol.insert_one(mongotag.insertable)

for post in xmliter(postfile, "row"):
    mongopost = PostDocument(post)
    posttags = set(mongopost.insertable['Tags'])
    if neededtagset.intersection(posttags):
        postcol.insert_one(mongopost.insertable)

for comment in xmliter(commentfile, "row"):
    mongocomment = CommentDocument(comment)
    commentcol.insert_one(mongocomment.insertable)