def run(file, start_page=1, end_page=1000000):
    """Prepare paging over English sentences and the per-batch YAML writer.

    Builds a query of ``Sentence`` rows whose language is
    ``Language.get('en')``, ordered by ``id``, wraps it in a ``Paginator``
    with 100 sentences per page, and defines the ``do_batch`` helper.

    Parameters:
        file: writable stream; ``do_batch`` writes to it with
            ``file.write`` and ``yaml.dump_all``.
        start_page: not referenced in the visible part of this function.
        end_page: not referenced in the visible part of this function.

    NOTE(review): nothing visible here calls ``do_batch`` or reads
    ``paginator`` / ``start_page`` / ``end_page``; presumably a paging loop
    belongs after the helper -- confirm against the full file.
    """
    all_sentences = Sentence.objects.filter(
        language=Language.get('en')).order_by('id')
    paginator = Paginator(all_sentences, 100)
    #pages = ((i,paginator.page(i)) for i in range(start_page,paginator.num_pages))

    def do_batch(sentences):
        """Process every sentence in ``sentences`` and dump the results.

        Results of ``process_sentence`` are accumulated for the whole batch,
        then a ``'\\n--- '`` separator and the YAML dump of the results are
        written to ``file``.

        Raises:
            Exception: whatever ``process_sentence`` raised, re-raised after
                being annotated with ``.sentence`` (the offending sentence)
                and ``.tb`` (the formatted traceback text).
        """
        preds = []
        for sentence in sentences:
            try:
                preds.extend(process_sentence(sentence))
            # changed to an improbable exception for now
            except Exception:
                # Take the active exception from sys.exc_info() so the
                # handler needs no version-specific "except ..., e" binding.
                e_type, e_value, e_tb = sys.exc_info()

                # Add sentence
                e_value.sentence = sentence

                # Extract traceback. format_exception() returns strings that
                # already end in a newline, so join them without a separator
                # to avoid doubling every line break.
                e_value.tb = "".join(
                    traceback.format_exception(e_type, e_value, e_tb))

                # Raise again; a bare raise keeps the original traceback
                # instead of restarting it at this line.
                raise
        file.write('\n--- ')
        yaml.dump_all(preds, file)
from csc.util import queryset_foreach from csc.conceptnet.models import Concept, SurfaceForm, Language, Assertion from django.db import connection en = Language.get('en') def fix_surface(surface): norm, residue = en.nl.lemma_split(surface.text) if norm != surface.concept.text: print print "surface:", surface.text.encode('utf-8') print "concept:", surface.concept.text.encode('utf-8') print "normal:", norm.encode('utf-8') surface.update(norm, residue) queryset_foreach(SurfaceForm.objects.filter(language=en), fix_surface, batch_size=100) # plan: # fix surface form -> concept mapping # remove obsolete concepts