def create_users():
    """Create a User record for every top-level 'User:' page in the DB."""
    session = Session()
    user_pages = session.query(Page).filter(Page.title.like('User:%'))
    processed = 0
    for user_page in user_pages:
        # Skip user sub-pages such as 'User:Foo/sandbox'.
        if '/' in user_page.title:
            continue
        session.add(User.make_user_from_page(user_page, session))
        processed += 1
        # Commit in batches of 1000, reporting progress as we go.
        if processed % 1000 == 0:
            print(processed)
            session.commit()
    # Flush whatever is left in the final partial batch.
    session.commit()
def create_comments():
    """Create Comment records from every 'Talk:' page in the DB.

    Commits in batches of 200 and prints a running count as progress.
    """
    session = Session()
    pages = session.query(Page).filter(Page.title.like('Talk:%'))
    counter = 0
    for page in pages:
        Comment.from_page(page, session)
        counter += 1
        # Increment BEFORE the batch check (the original checked first,
        # so it printed/committed on the very first page when counter was 0
        # and batches were off by one). Now matches create_users.
        if counter % 200 == 0:
            print(counter)
            session.commit()
    # Flush the final partial batch.
    session.commit()
def create_language_proficiencies():
    """Recompute language proficiencies for every User in the DB.

    Commits in batches of 200 and prints a running count as progress.
    """
    session = Session()
    users = session.query(User).all()
    counter = 0
    for user in users:
        user.set_proficiencies()
        counter += 1
        # Increment BEFORE the batch check (the original checked first,
        # so it printed/committed on the very first user when counter was 0
        # and batches were off by one). Now matches create_users.
        if counter % 200 == 0:
            print(counter)
            session.commit()
    # Flush the final partial batch.
    session.commit()
def load_xml():
    """Load pages from the English Wiktionary XML dump into the database.

    Creates the schema if needed, then inserts each parsed page that is not
    already present (keyed on language + title), committing in batches of
    1000 with progress output.

    NOTE(review): this function is defined twice in this module; this first
    definition is shadowed by the later one — remove one of the two.
    """
    session = Session()
    Base.metadata.create_all(engine)
    # Context manager so the dump file is closed even on error; the
    # original opened it and never closed it. (Also dropped the unused
    # total_lines variable.)
    with open(
            '../../wiktionary_data/enwiktionary-20120220-pages-meta-current.xml'
    ) as xml_file:
        xml_parser = XMLPageParser(xml_file, enPage)
        counter = 0
        for page in xml_parser:
            existing = session.query(Page).filter(
                Page.language == page.language, Page.title == page.title)
            # Only insert pages that have not already been loaded.
            if existing.count() == 0:
                session.add(page)
            if counter % 1000 == 0:
                print(counter)
                session.commit()
                print('committed')
            counter += 1
        session.commit()
def load_xml():
    """Load pages from the English Wiktionary XML dump into the database.

    Creates the schema if needed, then inserts each parsed page that is not
    already present (keyed on language + title), committing in batches of
    1000 with progress output.

    NOTE(review): this is the second of two identical definitions of
    load_xml in this module; it shadows the first — remove one of the two.
    """
    session = Session()
    Base.metadata.create_all(engine)
    # Context manager so the dump file is closed even on error; the
    # original opened it and never closed it. (Also dropped the unused
    # total_lines variable.)
    with open(
            '../../wiktionary_data/enwiktionary-20120220-pages-meta-current.xml'
    ) as xml_file:
        xml_parser = XMLPageParser(xml_file, enPage)
        counter = 0
        for page in xml_parser:
            existing = session.query(Page).filter(
                Page.language == page.language, Page.title == page.title)
            # Only insert pages that have not already been loaded.
            if existing.count() == 0:
                session.add(page)
            if counter % 1000 == 0:
                print(counter)
                session.commit()
                print('committed')
            counter += 1
        session.commit()
def set_proficiencies(self):
    """Sync this user's LanguageProficiency records from Babel templates.

    Parses {{Babel|...}} templates on the user's page (e.g. 'fr-2' means
    French at level 2; a bare code like 'en' means native), keeps records
    that still appear, deletes records that no longer do, and adds new ones.

    NOTE(review): this method appears twice in this file — remove one copy.
    """
    session = Session.object_session(self)
    # Raw string for the regex (the original used a plain string with
    # unrecognised escapes). Matches e.g. {{Babel|en|fr-2}}.
    pattern = re.compile(r'\{\{Babel(\-13)?\|(?P<content>[a-zA-Z\-0-9\|]+)}\}')
    if not self.page.text:
        matches = []
    else:
        matches = re.findall(pattern, self.page.text)
    bits = []
    for match in matches:
        # findall returns group tuples; group 1 is the named 'content'.
        content = match[1]
        bits += content.split('|')
    old_lps = self.language_proficiencies
    continuing_lps = []
    new_lps = []
    for bit in bits:
        if not bit:
            continue
        # Work out the language and proficiency: 'fr-2' -> ('fr', 2);
        # a bare language code means native proficiency.
        if bit[-1] in '0123456789':
            proficiency = int(bit[-1])
            language = bit[:-2]
        else:
            proficiency = LanguageProficiency.NATIVE
            language = bit
        old_match = None
        # Check whether there already exists a record for this.
        for lp in old_lps:
            if lp.language == language and lp.proficiency == proficiency:
                old_match = lp
                break
        # Make a note if it exists...
        if old_match:
            continuing_lps.append(old_match)
        # ...and add it if it doesn't.
        else:
            new_lps.append(LanguageProficiency(self, language, proficiency))
    # Delete records that no longer appear on the page.
    # (Removed a leftover `import pdb; pdb.set_trace()` breakpoint that
    # halted execution whenever a proficiency was deleted.)
    deleted_lps = list(set(old_lps) - set(continuing_lps))
    for lp in deleted_lps:
        session.delete(lp)
    for lp in new_lps:
        session.add(lp)
def set_proficiencies(self):
    """Sync this user's LanguageProficiency records from Babel templates.

    Parses {{Babel|...}} templates on the user's page (e.g. 'fr-2' means
    French at level 2; a bare code like 'en' means native), keeps records
    that still appear, deletes records that no longer do, and adds new ones.

    NOTE(review): this is the second of two identical copies of this
    method in this file — remove one.
    """
    session = Session.object_session(self)
    # Raw string for the regex (the original used a plain string with
    # unrecognised escapes). Matches e.g. {{Babel|en|fr-2}}.
    pattern = re.compile(r'\{\{Babel(\-13)?\|(?P<content>[a-zA-Z\-0-9\|]+)}\}')
    if not self.page.text:
        matches = []
    else:
        matches = re.findall(pattern, self.page.text)
    bits = []
    for match in matches:
        # findall returns group tuples; group 1 is the named 'content'.
        content = match[1]
        bits += content.split('|')
    old_lps = self.language_proficiencies
    continuing_lps = []
    new_lps = []
    for bit in bits:
        if not bit:
            continue
        # Work out the language and proficiency: 'fr-2' -> ('fr', 2);
        # a bare language code means native proficiency.
        if bit[-1] in '0123456789':
            proficiency = int(bit[-1])
            language = bit[:-2]
        else:
            proficiency = LanguageProficiency.NATIVE
            language = bit
        old_match = None
        # Check whether there already exists a record for this.
        for lp in old_lps:
            if lp.language == language and lp.proficiency == proficiency:
                old_match = lp
                break
        # Make a note if it exists...
        if old_match:
            continuing_lps.append(old_match)
        # ...and add it if it doesn't.
        else:
            new_lps.append(LanguageProficiency(self, language, proficiency))
    # Delete records that no longer appear on the page.
    # (Removed a leftover `import pdb; pdb.set_trace()` breakpoint that
    # halted execution whenever a proficiency was deleted.)
    deleted_lps = list(set(old_lps) - set(continuing_lps))
    for lp in deleted_lps:
        session.delete(lp)
    for lp in new_lps:
        session.add(lp)
def user(self):
    """Fetch the User that owns this record, keyed by (username, language)."""
    session = Session.object_session(self)
    return session.query(User).get((self.user_username, self.language))
def page(self):
    """Fetch the Page this record refers to, keyed by (title, language)."""
    session = Session.object_session(self)
    return session.query(Page).get((self.page_title, self.language))
def parse(self, shallow=False):
    """Parse this level-2 word-type section into a section tree.

    Expects exactly one Level2Block in self.text. Resolves the block's
    start tag to a word type; unrecognised types produce a FillerSection
    carrying an alert (Level2_not_Level3 when the heading should have been
    level 3, UnknownType otherwise). Recognised types create/update a Word
    via the mapped word class, then the content is chopped into level-3
    sub-blocks which become child sections.

    :param shallow: when True, child sections are appended unparsed.
    :returns: self on success, or a FillerSection when the section cannot
        be parsed as a word-type section.
    :raises ParsingError: if the text does not contain exactly one
        Level2Block.
    """
    super(simpleWordTypeSection, self).parse()
    l2bs = list(Chopper(self.text, [Level2Block, ]))
    # A word-type section must be exactly one level-2 block.
    if len(l2bs) != 1:
        raise ParsingError()
    title = l2bs[0].start_tag
    content = l2bs[0].text
    wordtype_title_sec = simpleWordTypeTitleSection(text=title,
                                                   parent=self).parse()
    wordtype = self.get_property('wordtype')
    # If we don't get a recognisable word type then we can't parse this
    # section.
    if wordtype not in level2_mapping:
        page_title = self.get_property('page').title
        section = FillerSection(text=self.text, parent=self.parent)
        if wordtype in level3_mapping:
            # This should be a level 3 heading: attach a fix-it alert with
            # the corrected markup.
            message = "%s: The heading %s should be level 3 not level 2." % (
                page_title, wordtype)
            fixed_text = u"===%s===%s" % (wordtype, content)
            alert = Level2_not_Level3(section, fixed_text, message,
                                      page_title)
        else:
            message = '%s: The word type "%s" is not known.' % (page_title,
                                                                wordtype)
            alert = UnknownType(message=message, title=page_title)
        section.alerts.append(alert)
        return section
    # Get the Word Class associated with this type.
    word_class = level2_mapping[wordtype]
    # If there is no Word Class then this section can be ignored.
    if word_class is None:
        return FillerSection(text=self.text, parent=self.parent)
    # Otherwise create a new Word object.
    # Track how many words of this class the parent page has seen, so each
    # word gets a stable ordinal within its class.
    if word_class not in self.parent.wordtypes:
        self.parent.wordtypes[word_class] = 1
        order = 0
    else:
        order = self.parent.wordtypes[word_class]
        self.parent.wordtypes[word_class] += 1
    new_word = word_class.get_and_update(title=self.parent.title,
                                         order=order,
                                         session=Session.object_session(
                                             self.parent),
                                         tags=self.get_property('tags'))
    self.set_property('word', new_word)
    self.parent.words.append(new_word)
    # If the title sub-section didn't parse cleanly, give up on the whole
    # section and return an incorrect filler instead.
    if not wordtype_title_sec.readable():
        new_section = FillerSection(text=self.text, parent=self.parent,
                                    correct=False)
        return new_section.parse()
    self.children.append(wordtype_title_sec)
    # Chop the content into level-3 blocks (keeping filler between them
    # and the heading tags themselves).
    for l3b in Chopper(content, [Level3Block, ], filler_blocks=True,
                       include_tags=True):
        # NOTE(review): FillerBlock chunks become simpleWordTypeHeaderSection
        # while tagged Level3 blocks become FillerSection — this looks
        # inverted; confirm the intended mapping against the Chopper/block
        # classes.
        if isinstance(l3b, FillerBlock):
            section = simpleWordTypeHeaderSection(text=l3b.text, parent=self)
        else:
            section = FillerSection(text=l3b.text, parent=self, correct=True)
        if not shallow:
            section = section.parse()
        self.children.append(section)
    return self