def _scan_wiktionary_page(page, authors, translation_markers):
    """Collect the title, revision text, word class and translations of a page.

    Args:
        page: XML element whose children are inspected; only children whose
            tag contains 'title' or 'revision' are used.
        authors: set updated in place with every non-empty user name that
            ``_get_username`` returns for a revision child.
        translation_markers: sequence of ``(language, template_prefix)``
            pairs; each prefix is handed to ``_get_translation``.

    Returns:
        A tuple ``(ca_label, text, verb, wanted, labels)``:
        ``ca_label`` is the page title (``u''`` if no title child was seen),
        ``text`` is the text of the last revision child (``None`` if there
        was none), ``verb`` is True when a revision contained ``{{ca-verb``,
        ``wanted`` is True when any of the verb/adverb/adjective templates
        was seen, and ``labels`` maps each language to its translation
        (``u''`` when no lookup was done).
    """
    ca_label = u''
    text = None
    verb = adverbi = adjectiu = False
    labels = dict((lang, u'') for lang, _ in translation_markers)

    # Iterating the element replaces the deprecated Element.getchildren().
    for page_element in page:
        if 'title' in page_element.tag:
            ca_label = unicode(page_element.text)

        if 'revision' not in page_element.tag:
            continue

        text = _get_revision_text(page_element)
        username = _get_username(page_element)
        if username:
            authors.add(username)

        if text is None:
            continue

        # Only the first matching template decides the word class.
        if '{{ca-verb' in text:
            verb = True
        elif '{{lema|ca|adv}}' in text:
            adverbi = True
        elif '{{ca-adj' in text:
            adjectiu = True

        if verb or adverbi or adjectiu:
            for lang, marker in translation_markers:
                labels[lang] = _get_translation(text, marker)

    return ca_label, text, verb, (verb or adverbi or adjectiu), labels


def _process_xml():
    """Index Catalan verbs, adverbs and adjectives from the Wiktionary dump.

    Parses 'cawiktionary-20160701-pages-meta-current.xml' and, for every page
    whose revision text carries a verb, adverb or adjective template, looks up
    the en/es/fr/de/it translations. A page is discarded when:

    * its title is not in the list returned by ``read_english_word_list``,
    * ``term_exists_in_index`` reports it as already present,
    * it is a verb whose title does not end in 'r', or
    * it has no translation at all.

    Each kept page is appended to 'words-ca.txt' and 'descriptions-ca.txt'
    and written to the index. Finally the per-language counters are passed to
    ``_show_statistics`` and ``_save_statistics`` and the index is saved.

    NOTE(review): relies on Python 2 semantics (``unicode``, byte strings
    written to text-mode files).
    """
    WIKIDICTIONARY = 2  # value passed as ``source`` to index.write_entry
    # Same lookup order as before: en, es, fr, de, it.
    translation_markers = (
        ('en', '{{trad|en|'),
        ('es', '{{trad|es|'),
        ('fr', '{{trad|fr|'),
        ('de', '{{trad|de|'),
        ('it', '{{trad|it|'),
    )
    stats = {
        "ca_labels": 0,
        "ca_descs": 0,
        "en_labels": 0,
        "fr_labels": 0,
        "de_labels": 0,
        "es_labels": 0,
        "it_labels": 0,
    }

    words = read_english_word_list()

    index = IndexCreator()
    index.open()
    # NOTE(review): authors is filled but not read inside this function.
    authors = set()

    # ``with`` guarantees both output files are closed even if parsing or
    # indexing raises part-way through.
    with open('words-ca.txt', 'w') as words_file_ca, \
            open('descriptions-ca.txt', 'w') as descriptions_file_ca:
        root = xml.etree.ElementTree.parse(
            'cawiktionary-20160701-pages-meta-current.xml').getroot()

        for page in root:
            ca_label, text, verb, wanted, labels = _scan_wiktionary_page(
                page, authors, translation_markers)

            if not wanted:
                continue

            if ca_label.lower().strip() not in words:
                logging.debug("Discard not in word list: %s", ca_label)
                continue

            if term_exists_in_index(index, ca_label, labels['en']):
                logging.debug("Discard already existing word in index: %s",
                              ca_label)
                continue

            # TODO: A better way to determine infinitives
            # endswith() also copes with an empty label, where indexing the
            # last character would raise IndexError.
            if verb and not to_str(ca_label).endswith('r'):
                logging.debug("Discard verb is not infinitive: %s", ca_label)
                continue

            # Checked before extracting the description so discarded pages
            # do not pay for the extraction.
            if not any(labels[lang] for lang, _ in translation_markers):
                logging.debug("Discard only ca_label:%s", ca_label)
                continue

            description = TextExtract(text).GetDescription()
            ca_desc = description if description else u''

            stats["ca_labels"] += 1
            if ca_desc:
                stats["ca_descs"] += 1
            for lang, _ in translation_markers:
                if labels[lang]:
                    stats[lang + "_labels"] += 1

            encoded_label = ca_label.encode('utf-8')
            words_file_ca.write(encoded_label + '\r\n')
            # A line is written even when the description is empty; the
            # former ``ca_desc is not None`` guard could never be false.
            descriptions_file_ca.write(
                '{0} - {1}\r\n'.format(encoded_label, ca_desc.encode('utf-8')))

            index.write_entry(word_en=labels['en'],
                              word_ca=ca_label,
                              word_fr=labels['fr'],
                              word_de=labels['de'],
                              word_es=labels['es'],
                              word_it=labels['it'],
                              definition_en=None,
                              definition_ca=ca_desc,
                              definition_fr=None,
                              definition_de=None,
                              definition_es=None,
                              definition_it=None,
                              image=None,
                              permission=None,
                              gec=None,
                              wikidata_id=None,
                              wikiquote_ca=None,
                              wikidictionary_ca=ca_label,
                              source=WIKIDICTIONARY)

        _show_statistics(stats)
        _save_statistics(stats)
        index.save()