Esempio n. 1
0
def run(file, start_page=1, end_page=1000000):
    all_sentences = Sentence.objects.filter(language=Language.get('en')).order_by('id')
    paginator = Paginator(all_sentences,100)
    #pages = ((i,paginator.page(i)) for i in range(start_page,paginator.num_pages))

    def do_batch(sentences):
        preds = []
        for sentence in sentences:
            try:
                preds.extend(process_sentence(sentence))
            # changed to an improbable exception for now
            except Exception, e:
                # Add sentence
                e.sentence = sentence

                # Extract traceback
                e_type, e_value, e_tb = sys.exc_info()
                e.tb = "\n".join(traceback.format_exception( e_type, e_value, e_tb ))

                # Raise again
                raise e
        file.write('\n--- ')
        yaml.dump_all(preds, file)
from csc.util import queryset_foreach
from csc.conceptnet.models import Concept, SurfaceForm, Language, Assertion
from django.db import connection

# Language object for 'en', looked up once when the module is executed; used
# by fix_surface and by the SurfaceForm filter below.
en = Language.get('en')

def fix_surface(surface):
    norm, residue = en.nl.lemma_split(surface.text)
    if norm != surface.concept.text:
        print
        print "surface:", surface.text.encode('utf-8')
        print "concept:", surface.concept.text.encode('utf-8')
        print "normal:", norm.encode('utf-8')
        surface.update(norm, residue)

# Module-level driver: hands the English SurfaceForm queryset and fix_surface
# to queryset_foreach as soon as this script is executed.
# batch_size=100 is passed through unchanged; presumably it sets how many rows
# are handled per batch -- confirm in csc.util.queryset_foreach.
queryset_foreach(SurfaceForm.objects.filter(language=en),
  fix_surface,
  batch_size=100)


# plan:
#  fix surface form -> concept mapping
#  remove obsolete concepts
Esempio n. 3
0
from csc.util import queryset_foreach
from csc.conceptnet.models import Concept, SurfaceForm, Language, Assertion
from django.db import connection

# Language object for 'en', looked up once when the module is executed; used
# by fix_surface and by the SurfaceForm filter below.
en = Language.get('en')


def fix_surface(surface):
    norm, residue = en.nl.lemma_split(surface.text)
    if norm != surface.concept.text:
        print
        print "surface:", surface.text.encode('utf-8')
        print "concept:", surface.concept.text.encode('utf-8')
        print "normal:", norm.encode('utf-8')
        surface.update(norm, residue)


# Module-level driver: hands the English SurfaceForm queryset and fix_surface
# to queryset_foreach as soon as this script is executed.
# batch_size=100 is passed through unchanged; presumably it sets how many rows
# are handled per batch -- confirm in csc.util.queryset_foreach.
queryset_foreach(SurfaceForm.objects.filter(language=en),
                 fix_surface,
                 batch_size=100)

# plan:
#  fix surface form -> concept mapping
#  remove obsolete concepts