Example 1
def create_users():
    # Create a User row for each top-level 'User:' page (subpages containing '/'
    # are skipped), committing in batches of 1000.
    session = Session()
    pgs = session.query(Page).filter(Page.title.like('User:%'))
    counter = 0
    for page in pgs:
        if '/' in page.title:
            continue
        user = User.make_user_from_page(page, session)
        session.add(user)
        counter += 1
        if counter % 1000 == 0:
            print(counter)
            session.commit()
    session.commit()
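All of the examples in this listing rely on a module-level SQLAlchemy setup (the Session factory, the engine and the declarative Base) that the snippets themselves do not show. A minimal sketch of what they appear to assume follows; the database URL and the import layout are illustrative assumptions, not taken from the original project.

# Sketch of the assumed setup; the URL and file layout are hypothetical.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('sqlite:///wiktionary.db')  # assumed URL
Session = sessionmaker(bind=engine)                # the Session() factory used in every example
Base = declarative_base()                          # Page, User, Comment, ... are assumed to subclass this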
Example 2
def create_comments():
    # Build Comment records from every 'Talk:' page, committing every 200 pages.
    session = Session()
    pages = session.query(Page).filter(Page.title.like('Talk:%'))
    counter = 0
    for page in pages:
        Comment.from_page(page, session)
        if counter % 200 == 0:
            print(counter)
            session.commit()
        counter += 1
    session.commit()
Example 3
def create_language_proficiencies():
    # Recompute the language proficiencies of every user, committing every 200 users.
    session = Session()
    users = session.query(User).all()
    counter = 0
    for user in users:
        user.set_proficiencies()
        if counter % 200 == 0:
            print(counter)
            session.commit()
        counter += 1
    session.commit()
Example 4
def load_xml():
    # Import pages from a Wiktionary XML dump, skipping pages that already exist
    # for the same (language, title) pair and committing every 1000 pages.
    session = Session()
    Base.metadata.create_all(engine)
    xml_file = open(
        '../../wiktionary_data/enwiktionary-20120220-pages-meta-current.xml')
    xml_parser = XMLPageParser(xml_file, enPage)
    counter = 0
    for page in xml_parser:
        ev = session.query(Page).filter(Page.language == page.language,
                                        Page.title == page.title)
        if ev.count() == 0:
            session.add(page)
        if counter % 1000 == 0:
            print(counter)
            session.commit()
            print('committed')
        counter += 1
    session.commit()
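A side note on the duplicate check above: Query.count() issues a COUNT query only to test whether a row exists. An equivalent check inside the loop that stops at the first matching row (a sketch against the same assumed models) would be:

        if session.query(Page).filter(Page.language == page.language,
                                      Page.title == page.title).first() is None:
            session.add(page)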
Example 5
 def set_proficiencies(self):
     session = Session.object_session(self)
     pattern = re.compile(
         r'\{\{Babel(\-13)?\|(?P<content>[a-zA-Z\-0-9\|]+)}\}')
     if not self.page.text:
         matches = []
     else:
         matches = re.findall(pattern, self.page.text)
     bits = []
     for match in matches:
         content = match[1]
         bits += content.split('|')
     old_lps = self.language_proficiencies
     continuing_lps = []
     new_lps = []
     for bit in bits:
         if not bit:
             continue
         # Work out the language and proficiency.
         if bit[-1] in '0123456789':
             proficiency = int(bit[-1])
             language = bit[:-2]
         else:
             proficiency = LanguageProficiency.NATIVE
             language = bit
         old_match = None
         # Check whether there already exists a record for this.
         for lp in old_lps:
             if lp.language == language and lp.proficiency == proficiency:
                 old_match = lp
                 break
         # Make a note of it if it exists.
         if old_match:
             continuing_lps.append(old_match)
         # And add it if it doesn't
         else:
             new_lps.append(LanguageProficiency(self, language,
                                                proficiency))
     deleted_lps = list(set(old_lps) - set(continuing_lps))
     for lp in deleted_lps:
         session.delete(lp)
     for lp in new_lps:
         session.add(lp)
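For reference, the regular expression and the split in set_proficiencies turn a Babel box into (language, proficiency) pairs. The wikitext below is illustrative, not taken from the source:

# Hypothetical user-page text:
#   {{Babel|en|de-2|fr-1}}
# The named 'content' group captures 'en|de-2|fr-1'; splitting it on '|'
# gives ['en', 'de-2', 'fr-1'], which the loop maps to
#   ('en', LanguageProficiency.NATIVE), ('de', 2), ('fr', 1).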
Example 6
 def user(self):
     session = Session.object_session(self)
     user = session.query(User).get((self.user_username, self.language))
     return user
Example 7
 def page(self):
     session = Session.object_session(self)
     page = session.query(Page).get((self.page_title, self.language))
     return page
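Examples 6 and 7 pass a tuple to Query.get(), which is how SQLAlchemy looks up a row by a composite primary key. A mapping consistent with these lookups would look roughly like the sketch below; the table name, column names and types are assumptions, not taken from the original models.

# Hypothetical mapping consistent with the composite-key lookups above.
from sqlalchemy import Column, Unicode, UnicodeText

class Page(Base):
    __tablename__ = 'pages'
    title = Column(Unicode(255), primary_key=True)     # first element of the tuple
    language = Column(Unicode(16), primary_key=True)   # second element of the tuple
    text = Column(UnicodeText)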
Example 8
 def parse(self, shallow=False):
     super(simpleWordTypeSection, self).parse()
     l2bs = list(Chopper(self.text, [Level2Block]))
     if len(l2bs) != 1:
         raise ParsingError()
     title = l2bs[0].start_tag
     content = l2bs[0].text
     wordtype_title_sec = simpleWordTypeTitleSection(text=title,
                                                     parent=self).parse()
     wordtype = self.get_property('wordtype')
     # If we don't get a recognisable word type then we can't parse this section.
     if wordtype not in level2_mapping:
         page_title = self.get_property('page').title
         section = FillerSection(text=self.text, parent=self.parent)
         if wordtype in level3_mapping:
             # This should be a level 3 heading.
             message = "%s: The heading %s should be level 3 not level 2." % (
                 page_title, wordtype)
             fixed_text = u"===%s===%s" % (wordtype, content)
             alert = Level2_not_Level3(section, fixed_text, message,
                                       page_title)
         else:
             message = '%s: The word type "%s" is not known.' % (page_title,
                                                                 wordtype)
             alert = UnknownType(message=message, title=page_title)
         section.alerts.append(alert)
         return section
     # Get the Word Class associated with this type.
     word_class = level2_mapping[wordtype]
     # If there is no Word Class then this section can be ignored.
     if word_class is None:
         return FillerSection(text=self.text, parent=self.parent)
     # Otherwise create a new Word object.
     if word_class not in self.parent.wordtypes:
         self.parent.wordtypes[word_class] = 1
         order = 0
     else:
         order = self.parent.wordtypes[word_class]
         self.parent.wordtypes[word_class] += 1
     new_word = word_class.get_and_update(title=self.parent.title,
                                          order=order,
                                          session=Session.object_session(
                                              self.parent),
                                          tags=self.get_property('tags'))
     self.set_property('word', new_word)
     self.parent.words.append(new_word)
     if not wordtype_title_sec.readable():
         new_section = FillerSection(text=self.text,
                                     parent=self.parent,
                                     correct=False)
         return new_section.parse()
     self.children.append(wordtype_title_sec)
     for l3b in Chopper(content, [Level3Block],
                        filler_blocks=True, include_tags=True):
         if isinstance(l3b, FillerBlock):
             section = simpleWordTypeHeaderSection(text=l3b.text,
                                                   parent=self)
         else:
             section = FillerSection(text=l3b.text,
                                     parent=self,
                                     correct=True)
         if not shallow:
             section = section.parse()
         self.children.append(section)
     return self
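The parse method above consults two lookup tables, level2_mapping and level3_mapping, defined elsewhere in the project and not shown in this listing. A hypothetical shape consistent with how they are used (heading text mapped to a Word class, or to None for headings that are recognised but ignored) is:

# Hypothetical shape of the lookup tables; the keys and classes are assumptions.
level2_mapping = {
    'Noun': Noun,       # heading text -> Word class handled at level 2
    'Verb': Verb,
    'Symbol': None,     # recognised heading with no Word class: section is ignored
}
level3_mapping = {
    'Synonyms': True,   # headings expected at level 3, not level 2
}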