# module-level dependencies: `distill`, `log_routine`, `NL_STOPWORDS` and
# `EN_STOPWORDS` are assumed to be defined elsewhere in this module/package.
from datetime import datetime
import logging
import re

logger = logging.getLogger(__name__)  # assumption: a module-level logger is used


def decant( corpus, routine, settings, ref_completion=1.0 ):
    from sven.anta.models import Analysis, Routine, Segment, Segment_Concept, Document, Document_Segment, Document_Tag, Tag, Concept
    from sven.anta.utils import textify
    # path = settings.MEDIA_ROOT + options.corpus
    # print NL_STOPWORDS
    logger.info( "starting pattern analysis on corpus: '%s' [%s]" % ( corpus.name, corpus.id ) )
    log_routine( routine, entry="[info] starting pattern analysis on corpus: %s" % corpus.id )

    # current analysis, if any
    try:
        analysis = Analysis.objects.get( corpus=corpus, type="PT" )
    except Analysis.DoesNotExist:
        analysis = Analysis( corpus=corpus, type="PT", start_date=datetime.now(), status="CRE" )
        analysis.save()
    except:
        raise
    routine.analysis.add( analysis )
    routine.save()

    # total number of documents in the corpus
    total_count = Document.objects.filter( corpus=corpus ).count()

    # documents still to be analyzed (status 'NEW')
    documents = Document.objects.filter( corpus__id=corpus.id, status='NEW' )
    logger.info( "['%s':%s] documents to be analyzed: %s" % ( corpus.name, corpus.id, documents.count() ) )

    # previous resume-from-last-document logic, kept for reference:
    # if analysis.document is None:
    #     documents = Document.objects.filter( corpus__id=corpus.id )
    #     analysis.document = documents[0]
    # else:
    #     documents = Document.objects.filter( corpus__id=corpus.id, id__gt=analysis.document.id )
    #     if documents.count() == 0:
    #         documents = Document.objects.filter( corpus__id=corpus.id )

    # pending status for current analysis
    analysis.status = "PN"
    analysis.save()

    i = 0
    # cycle through documents
    for d in documents:
        i = i + 1
        d.status = 'IN'
        d.save()

        # update analysis with current document
        analysis.document = d
        analysis.save()
        log_routine( routine, completion=ref_completion * i / total_count )
        # a = Analysis( document=d, )
        logger.info( "%s / %s trying to convert document '%s' [%s], mimetype %s" % ( i, total_count, d.title, d.id, d.mime_type ) )

        # convert the document to plain text
        try:
            textified = textify( d, settings.MEDIA_ROOT )
        except Exception, e:
            logger.exception( '[%s:%s] Exception: textify function failed on document "%s" [%s]' % ( corpus.name, corpus.id, d.title, d.id ) )
            analysis.status = "ERR"
            d.status = 'ERR'
            d.save()
            analysis.save()
            logger.error( "%s / %s FAILED converting document '%s' [%s], mimetype %s with Exception: %s" % ( i, total_count, d.title, d.id, d.mime_type, e ) )
            continue
        if textified == False:
            analysis.status = "ERR"
            d.status = 'ERR'
            d.save()
            analysis.save()
            logger.error( "%s / %s FAILED converting document '%s' [%s], mimetype %s" % ( i, total_count, d.title, d.id, d.mime_type ) )
            continue

        textified = textified.replace( "%20", " " )
        analysis.completion = 0.0
        analysis.document = d
        analysis.save()

        # load stopwords for the document's language
        if d.language == "NL":
            stopwords = NL_STOPWORDS
        elif d.language == "EN":
            stopwords = EN_STOPWORDS
        else:
            stopwords = []

        logger.info( "%s / %s NP extraction started over document '%s' [%s], language: '%s', file: '%s'" % ( i, total_count, d.title, d.id, d.language, textified ) )

        # start distill analysis, excluding the given stopwords
        try:
            distilled = distill( filename=textified, language=d.language.lower(), stopwords=stopwords )
        except Exception, e:
            d.status = 'ERR'
            logger.exception( "EXCEPTION distill document" )
            logger.error( "%s / %s FAILED distill document '%s' [%s], mimetype %s with Exception: %s" % ( i, total_count, d.title, d.id, d.mime_type, e ) )
            d.save()
            continue
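        # shape of `distilled`, inferred from its usage below (an assumption;
        # the return value of distill() is not documented in this module):
        #   distilled['keywords'] : list of tuples, with k[1] the keyword string
        #   distilled['segments'] : list of (content, stemmed_tokens, tf) tuples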
        analysis.completion = .1
        analysis.save()

        # append keywords as tags for the document
        for k in distilled['keywords']:
            # print k
            candidate = k[1]
            # get or create the tag
            try:
                t = Tag.objects.get( name=candidate, type="keyword" )
            except:
                # todo: lemma version of a word according to language
                t = Tag( name=candidate, type="keyword" )
                try:
                    t.save()
                except:
                    logger.warning( "unable to save as tag: %s" % candidate )
                    continue
            # set tag-document relation
            try:
                td = Document_Tag( document=d, tag=t )
                td.save()
            except:
                # relation exists, continue
                continue

        analysis.completion = .5
        analysis.save()

        # segments
        first = True
        for segment in distilled['segments']:
            if len(segment[0]) > 128:
                logger.warning( "segment longer than 128 chars, skipped: %s" % segment[0] )
                continue
            try:
                s = Segment.objects.get( content=segment[0][:128], language=d.language )
            except:
                s = Segment( content=segment[0][:128], stemmed=re.sub( r"\s+", ' ', " ".join( segment[1] )[:128] ), language=d.language )
                try:
                    s.save()
                except:
                    logger.warning( "unable to save segment: %s" % segment[0][:128] )
                    continue
            try:
                sd = Document_Segment.objects.get( document=d, segment=s )
            except:
                # relationship does not exist yet, create it
                sd = Document_Segment( document=d, segment=s, tf=segment[2] )
                sd.save()
            if first:
                logger.info( "sample 'segment' saved: %s %s, stem: %s, tf: %s" % ( s.id, s.content, s.stemmed, sd.tf ) )

            # save concepts and attach them to the segment
            for k in segment[1]:
                # ignore numbers
                k = re.sub( r"[\d\-\.]+", "", k )
                if len(k) < 2:
                    continue
                # get or create the concept
                try:
                    c = Concept.objects.get( content=k, language=d.language )
                except:
                    try:
                        c = Concept( content=k, language=d.language )
                        c.save()
                    except Exception, e:
                        logger.warning( "unable to save concept: %s, exception: %s" % ( k, e ) )
                        continue
                # attach the concept to the segment
                try:
                    sc = Segment_Concept.objects.get( segment=s, concept=c )
                except:
                    sc = Segment_Concept( segment=s, concept=c )
                    sc.save()
                if first:
                    logger.info( "sample 'concept' saved: %s %s" % ( c.id, c.content ) )
                    first = False

        logger.info( "analysis ended on document '%s' [%s]" % ( d.title, d.id ) )
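
# Example invocation (a sketch under assumptions): `routine` is a
# sven.anta.models.Routine instance, `settings` is the Django settings object,
# and a Corpus model is assumed to exist in sven.anta.models (it is not among
# the imports above). Name and pk below are hypothetical.
#
#   from django.conf import settings
#   from sven.anta.models import Corpus, Routine
#
#   corpus  = Corpus.objects.get( name="my-corpus" )   # hypothetical corpus
#   routine = Routine.objects.get( pk=1 )              # hypothetical routine
#   decant( corpus, routine, settings )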