Exemple #1
0
def index(personDB, familyDB, relationDB):
    """Rebuild the Lucene index for every person and every family.

    The index is opened in CREATE mode using the module-level ``analyzer``
    and ``indexDir``. Each person and each family becomes one document with:

    * ``uid``  - ``str()`` of the record's ``_id`` (stored)
    * ``sex``  - the person's ``sex``, or the literal ``'FAM'`` for a family
      (stored)
    * ``text`` - the match text built by ``matchtext`` (not stored)

    If anything fails, the uncommitted changes are rolled back so that the
    writer (and its write lock on the index directory) is released before
    the exception propagates.

    Args:
        personDB: person collection; iterated with ``find``.
        familyDB: family collection; iterated with ``find``.
        relationDB: relation collection; only handed on to ``matchtext``.

    Returns:
        None.
    """
    mt = matchtext()

    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    # TODO: check whether a larger RAM buffer (setRAMBufferSizeMB, e.g. 256) helps

    try:
        # A no_cursor_timeout cursor is never reaped by the server, so it has
        # to be closed explicitly if the loop is left early.
        persons = personDB.find({}, no_cursor_timeout=True)
        try:
            for p in persons:
                matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
                doc = Document()
                doc.add(Field('uid', str(p['_id']), StringField.TYPE_STORED))
                doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
                doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
                writer.addDocument(doc)
        finally:
            persons.close()

        # Family matchtext
        for f in familyDB.find():
            matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
            doc = Document()
            doc.add(Field('uid', str(f['_id']), StringField.TYPE_STORED))
            doc.add(Field('sex', 'FAM', StringField.TYPE_STORED))
            doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
            writer.addDocument(doc)

        writer.commit()
    except BaseException:
        # Discard the partial work and release the write lock, then re-raise.
        writer.rollback()
        raise
    writer.close()
Exemple #2
0
def viewNoRelErr(personIds, familyIds, config):
    """Build a table of possible duplicates for one person.

    The person's match text is used for a Lucene search (at most 5 hits,
    restricted by the person's sex). A hit is dropped when it is the person
    itself, when it is no longer in the person collection, or when both
    birth years are known and differ by more than 10 years.

    Args:
        personIds: ``_id`` of the person to look up (a single id, despite
            the plural name).
        familyIds: not used by this function.
        config: mapping that provides ``'workDB'``, ``'persons'``,
            ``'families'`` and ``'relations'``.

    Returns:
        A tuple ``(tab, '')``. ``tab`` is a list of rows; the first row is
        the header, each following row holds the score, the person's
        columns from ``persTab``, a merge button and the candidate's columns
        from ``persTab``.
    """
    #from luceneUtils import setupDir, search
    #setupDir(config['workDB'])
    searchDB = luceneDB(config['workDB'])
    from matchtext import matchtext
    mt_tmp = matchtext()
    persons = config['persons']
    person = persons.find_one({'_id': personIds})
    matchtxt = mt_tmp.matchtextPerson(person, persons,
                                      config['families'], config['relations'])
    candidates = searchDB.search(matchtxt, person['sex'], 5)  # Lucene search

    tab = [['Score', u'Namn/refId', u'Född', u'Död', '',
            u'Namn/refId', u'Född', u'Död']]
    personCols = None  # persTab() output for the person, computed on first use
    for (kid, score) in candidates:
        if kid == personIds:
            continue
        cand = persons.find_one({'_id': kid})
        if not cand:
            continue
        try:
            birthDiff = abs(int(person['birth']['date'][0:4])
                            - int(cand['birth']['date'][0:4]))
            if birthDiff > 10:
                continue
        except (KeyError, TypeError, ValueError):
            # Missing or unparsable birth date: keep the candidate.
            pass
        if personCols is None:
            # Identical for every row, so it is looked up only once.
            personCols = list(persTab(personIds, persons))
        args = {'where': 'visa', 'what': '/actions/mergePers',
                'id1': str(personIds), 'id2': str(kid)}
        button = '<br><button onclick="doAction('+str(args)+')">Samma person</button>'
        row = ["%3.0f" % (score)]
        row.extend(personCols)
        row.append(button)
        row.extend(persTab(kid, persons))
        tab.append(row)
    return (tab, '')
Exemple #3
0
 def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
     """Re-index person ``pid1`` and remove ``pid2`` from the Lucene index.

     The index documents whose ``uid`` is ``pid1`` or ``pid2`` are deleted,
     a fresh document for ``pid1`` is built from the database and added,
     and ``self.searcher`` is reopened on the updated index.

     If any step fails, the uncommitted changes are rolled back so that the
     writer and its write lock are released before the exception propagates.

     Args:
         pid1: ``_id`` of the person whose index document is rebuilt.
         pid2: ``_id`` of the person whose index document is removed.
         personDB: person collection; ``pid1`` is read with ``find_one``.
         familyDB: family collection; only handed on to ``matchtext``.
         relationDB: relation collection; only handed on to ``matchtext``.

     Returns:
         None.
     """
     mt = matchtext()
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     try:
         # 'uid' is indexed as str(_id), so delete by the same representation.
         writer.deleteDocuments(Term('uid', str(pid1)))
         writer.deleteDocuments(Term('uid', str(pid2)))
         p = personDB.find_one({'_id': pid1})
         matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
         doc = Document()
         doc.add(Field('uid', str(pid1), StringField.TYPE_STORED))
         doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
         doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
         doc.add(Field("text", mt.luceneFix(self.personText(p)),
                       TextField.TYPE_NOT_STORED))
         writer.addDocument(doc)
         writer.commit()
     except BaseException:
         # Discard the partial update and release the write lock, then re-raise.
         writer.rollback()
         raise
     writer.close()
     # NOTE(review): the reader behind the previous self.searcher is not
     # closed here - confirm whether it is released elsewhere.
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
Exemple #4
0
 def updateDeleteRec(self, pid1, pid2, personDB, familyDB, relationDB):
     """Re-index person ``pid1`` and remove ``pid2`` from the Lucene index.

     The index documents whose ``uid`` is ``pid1`` or ``pid2`` are deleted,
     a fresh document for ``pid1`` is built from the database and added,
     and ``self.searcher`` is reopened on the updated index.

     If any step fails, the uncommitted changes are rolled back so that the
     writer and its write lock are released before the exception propagates.

     Args:
         pid1: ``_id`` of the person whose index document is rebuilt.
         pid2: ``_id`` of the person whose index document is removed.
         personDB: person collection; ``pid1`` is read with ``find_one``.
         familyDB: family collection; only handed on to ``matchtext``.
         relationDB: relation collection; only handed on to ``matchtext``.

     Returns:
         None.
     """
     mt = matchtext()
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     try:
         # 'uid' is indexed as str(_id), so delete by the same representation.
         writer.deleteDocuments(Term('uid', str(pid1)))
         writer.deleteDocuments(Term('uid', str(pid2)))
         p = personDB.find_one({'_id': pid1})
         matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
         doc = Document()
         doc.add(Field('uid', str(pid1), StringField.TYPE_STORED))
         doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
         doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
         doc.add(
             Field("text", mt.luceneFix(self.personText(p)),
                   TextField.TYPE_NOT_STORED))
         writer.addDocument(doc)
         writer.commit()
     except BaseException:
         # Discard the partial update and release the write lock, then re-raise.
         writer.rollback()
         raise
     writer.close()
     # NOTE(review): the reader behind the previous self.searcher is not
     # closed here - confirm whether it is released elsewhere.
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
Exemple #5
0
    def index(self, personDB, familyDB, relationDB):
        """
        Indexes a database (the index is opened in CREATE mode).

        Field match includes information about parents and is used to find matches
        Field text has Ids, names, places, and dates and is used to find a person/family

        Persons get the fields uid, sex, match and text. Families get only
        uid and text, where text is the family ``_id`` followed by its
        ``refId`` when present.

        If anything fails, the uncommitted changes are rolled back so that
        the writer and its write lock are released before the exception
        propagates.

        Args:
            personDB: person collection; iterated with ``find``.
            familyDB: family collection; iterated with ``find``.
            relationDB: relation collection; only handed on to ``matchtext``.

        Returns:
            None.
        """
        mt = matchtext()

        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(self.indexDir, config)
        # TODO: consider a larger RAM buffer (setRAMBufferSizeMB(256))?

        try:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # has to be closed explicitly if the loop is left early.
            persons = personDB.find({}, no_cursor_timeout=True)
            try:
                for p in persons:
                    matchtxt = mt.matchtextPerson(p, personDB, familyDB,
                                                  relationDB)
                    doc = Document()
                    doc.add(Field('uid', str(p['_id']),
                                  StringField.TYPE_STORED))
                    doc.add(Field('sex', str(p['sex']),
                                  StringField.TYPE_STORED))
                    doc.add(Field("match", matchtxt,
                                  TextField.TYPE_NOT_STORED))
                    doc.add(
                        Field("text", mt.luceneFix(self.personText(p)),
                              TextField.TYPE_NOT_STORED))
                    writer.addDocument(doc)
            finally:
                persons.close()

            # Families: no 'sex' or 'match' field is indexed for them.
            for f in familyDB.find():
                doc = Document()
                doc.add(Field('uid', str(f['_id']), StringField.TYPE_STORED))
                txt = f['_id']
                if 'refId' in f:
                    txt += ' ' + f['refId']
                doc.add(Field("text", txt, TextField.TYPE_NOT_STORED))
                writer.addDocument(doc)

            writer.commit()
        except BaseException:
            # Discard the partial work and release the write lock, then re-raise.
            writer.rollback()
            raise
        writer.close()
Exemple #6
0
    def index(self, personDB, familyDB, relationDB):
        """
        Indexes a database (the index is opened in CREATE mode).

        Field match includes information about parents and is used to find matches
        Field text has Ids, names, places, and dates and is used to find a person/family

        Persons get the fields uid, sex, match and text. Families get only
        uid and text, where text is the family ``_id`` followed by its
        ``refId`` when present.

        If anything fails, the uncommitted changes are rolled back so that
        the writer and its write lock are released before the exception
        propagates.

        Args:
            personDB: person collection; iterated with ``find``.
            familyDB: family collection; iterated with ``find``.
            relationDB: relation collection; only handed on to ``matchtext``.

        Returns:
            None.
        """
        mt = matchtext()

        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(self.indexDir, config)
        # TODO: consider a larger RAM buffer (setRAMBufferSizeMB(256))?

        try:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # has to be closed explicitly if the loop is left early.
            persons = personDB.find({}, no_cursor_timeout=True)
            try:
                for p in persons:
                    matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
                    doc = Document()
                    doc.add(Field('uid', str(p['_id']), StringField.TYPE_STORED))
                    doc.add(Field('sex', str(p['sex']), StringField.TYPE_STORED))
                    doc.add(Field("match", matchtxt, TextField.TYPE_NOT_STORED))
                    doc.add(Field("text", mt.luceneFix(self.personText(p)), TextField.TYPE_NOT_STORED))
                    writer.addDocument(doc)
            finally:
                persons.close()

            # Families: no 'sex' or 'match' field is indexed for them.
            for f in familyDB.find():
                doc = Document()
                doc.add(Field('uid', str(f['_id']), StringField.TYPE_STORED))
                txt = f['_id']
                if 'refId' in f:
                    txt += ' ' + f['refId']
                doc.add(Field("text", txt, TextField.TYPE_NOT_STORED))
                writer.addDocument(doc)

            writer.commit()
        except BaseException:
            # Discard the partial work and release the write lock, then re-raise.
            writer.rollback()
            raise
        writer.close()
Exemple #7
0
# -*- coding: utf-8 -*-
# This Python file uses the following encoding: utf-8
import common
from matchUtils import *
import os, sys, logging
from collections import defaultdict
from matchtext import matchtext
from luceneUtils import search, setupDir, index
mt_tmp = matchtext()
from dbUtils import getFamilyFromChild

# FIXME: normalization factors for the Lucene score. The 'default' entry is
# presumably the fallback for names that are not listed - confirm with callers.
norm = {
    'kalle_testp1': 8.7,
    'kalle_testp2': 5.1,
    'kalle_testp3': 8.6,
    'kalle_testp4': 8.4,
    'kalle_testp5': 10.2,
    'default': 9.0,
}

def SVMfeatures(work, match, conf, score):
    """Return whatever ``svmbaseline`` computes for the given arguments.

    All four arguments are passed through unchanged; the baseline algorithm
    is currently the only one wired in.
    """
    features = svmbaseline(work, match, conf, score)
    return features
#    alg = os.environ['SVMalgoritm']
#    if alg == 'all42': return svmall42(work, match, conf, score)
#    elif alg == 'baseline': return svmbaseline(work, match, conf, score)
#    elif alg == 'oldPerson': return svmbaseline(work, match, conf, score)
#    else:
#        print 'ERR unkown person algorithm:', alg
#        sys.exit()

def famSVMfeatures(work, match, conf, score):
    """Return whatever ``svmFamily`` computes for ``work``, ``match``, ``conf``.

    ``score`` is accepted but not forwarded to ``svmFamily``.
    """
    features = svmFamily(work, match, conf)
    return features
#    alg = os.environ['famSVMalgoritm']
#    if alg == 'family': return svmFamily(work, match, conf)
#    else:
#        print 'ERR unkown family algorithm:', alg
Exemple #8
0
# -*- coding: utf-8 -*-
# This Python file uses the following encoding: utf-8
import re, sys, math
from datetime import date
from svmutil import svm_load_model, svm_predict
import importlib
from matchtext import matchtext
from dbUtils import getFamilyFromChild

# Module-level matchtext instance, created once at import time.
mt_tmp = matchtext()
# Starts out False; presumably replaced by a loaded SVM model later
# (svm_load_model is imported above) - TODO confirm.
svmModel = False
# Starts out None; presumably bound to a feature function later
# (importlib is imported above) - TODO confirm.
SVMfeatures = None

# Module-level dict, initially empty; by its name presumably a memo cache
# - TODO confirm what is stored in it.
_cache = {}


def cos(l1, l2):
    """Return the cosine similarity of two whitespace-separated term strings.

    Similarity between two vectors = cosine of the angle between them:

        cosine = (V1 * V2) / (||V1|| x ||V2||)

    Boolean weights are assumed: the dot product is the number of terms of
    ``l1`` (repeats included) that also occur in ``l2``, and each norm is
    the square root of the number of terms in that string.

    Args:
        l1: first vector, as a string of terms separated by whitespace.
        l2: second vector, in the same format.

    Returns:
        The similarity as a float; 0.0 when either string has no terms.
    """
    v1 = l1.split()
    v2 = l2.split()
    if not v1 or not v2:
        # No terms means no overlap; also avoids dividing by zero below.
        return 0.0
    known = set(v2)  # O(1) membership instead of scanning the list per term
    shared = sum(1 for w1 in v1 if w1 in known)
    return shared / (math.sqrt(len(v1)) * math.sqrt(len(v2)))


def compName(n1, n2):
    """ Compare names: n1 n2 strings, blankspace separated names
        return value between -1 (mismatch) and 1 (match)
Exemple #9
0
def editList(config, typ):
    """Collect the tables shown by the database tools / duplicate editor.

    Runs ``sanity`` on the person, family and relation collections, passes
    each error list through its repair function and turns what is left into
    table rows with a 'Visa' button. For ``typ`` 'dubblett' or
    'dubblettFind' the global ``dubblList`` is returned as well; it is
    recomputed when it is empty or when ``typ`` is 'dubblettFind'.

    Args:
        config: mapping that provides ``'workDB'``, ``'persons'``,
            ``'families'``, ``'relations'`` and ``'originalData'``.
        typ: one of 'child', 'family', 'relation', 'dubblett',
            'dubblettFind'. Any other value resets ``dubblList`` and checks
            'child' and 'family'.

    Returns:
        A tuple ``(tit, childErrs, famErrs, relErrs, dubbls)``: the page
        title followed by four tables (lists of rows). The first three
        tables start with a header row.
    """
    global dubblList
    tit = 'Verktyg databas: ' + config['workDB']
    doTyp = []
    if typ in ('child', 'family', 'relation', 'dubblett', 'dubblettFind'):
        doTyp.append(typ)
    else:
        dubblList = []
        doTyp.append('child')
        doTyp.append('family')
    childErrs = [['Typ', 'Person', 'Familjer', 'action']]
    famErrs = [['Typ', 'Personer', 'Familj', 'action']]
    relErrs = [['Typ', 'Person', '', 'action']]
    dubbls = []
    (cErr, fErr, rErr) = sanity(config['persons'], config['families'],
                                config['relations'], do=doTyp)

    cErr = repairChild(cErr, config['persons'], config['families'],
                       config['relations'], config['originalData'])
    for (pers, chFams) in cErr:
        child = "%s %s" % (pers['_id'], pers['name'])
        args = {'where': 'visa', 'what': '/view/relErr', 'typ': 'child',
                'person': str(pers['_id']), 'family': str(':'.join(chFams))}
        childErrs.append(['Child', child, '; '.join(chFams),
                          _visaButton(args)])

    # One husband/wife per family
    fErr = repairFam(fErr, config['persons'], config['families'],
                     config['relations'], config['originalData'])
    for (famId, persList) in fErr:
        person = ["%s %s" % (pers['_id'], pers['name']) for pers in persList]
        pids = [pers['_id'] for pers in persList]
        args = {'where': 'visa', 'what': '/view/relErr', 'typ': 'partner',
                'person': str(':'.join(pids)), 'family': str(famId)}
        famErrs.append(['Partner', '<br>'.join(person), famId,
                        _visaButton(args)])

    rErr = repairRel(rErr, config['persons'], config['families'],
                     config['relations'], config['originalData'])
    for pid in rErr:
        pers = config['persons'].find_one({'_id': pid})
        person = "%s %s" % (pers['_id'], pers['name'])
        args = {'where': 'visa', 'what': '/view/relErr', 'typ': 'noRel',
                'person': str(pers['_id']), 'family': ''}
        relErrs.append(['Inga relationer', person, '', _visaButton(args)])

    if typ in ('dubblett', 'dubblettFind'):
        tit = 'Dubblett editor databas: ' + config['workDB']
        if not dubblList or typ == 'dubblettFind':
            tab = _findDubblettCandidates(config)
            # The header row is part of tab: in reverse string order 'Score'
            # sorts ahead of the formatted numbers, so it stays on top.
            # NOTE(review): scores are compared as formatted strings here.
            dubblList = sorted(tab, key=itemgetter(0), reverse=True)[0:25]
            findAndMergeDuplFams(config['persons'], config['families'],
                                 config['relations'], config['originalData'])
            # NOTE: does not update luceneDB - OK?
        dubbls = dubblList
    return (tit, childErrs, famErrs, relErrs, dubbls)


def _visaButton(args):
    """Return the HTML of a 'Visa' button that calls doAction(args)."""
    return '<button onclick="doAction('+str(args)+')">Visa</button>'


def _birthYearsFarApart(person, cand, maxDiff=10):
    """Return True if both birth years parse and differ by more than maxDiff."""
    try:
        diff = abs(int(person['birth']['date'][0:4])
                   - int(cand['birth']['date'][0:4]))
    except (KeyError, TypeError, ValueError):
        # Missing or unparsable birth date: cannot rule the candidate out.
        return False
    return diff > maxDiff


def _findDubblettCandidates(config):
    """Return the unsorted duplicate table (header row first) for all persons."""
    searchDB = luceneDB(config['workDB'])
    from matchtext import matchtext
    mt_tmp = matchtext()
    persons = config['persons']
    tab = [['Score', u'Namn/Id', 'Kandidat Id']]
    done = set()  # 'candidate;person' keys of pairs that were already listed
    for person in persons.find():
        matchtxt = mt_tmp.matchtextPerson(person, persons,
                                          config['families'],
                                          config['relations'])
        # Parent terms are left out of the duplicate search.
        txt = [term for term in matchtxt.split()
               if not term.startswith(('Father', 'Mother'))]
        candidates = searchDB.search(' '.join(txt), person['sex'], 4)  # Lucene search
        pstr = "%s %s" % (person['_id'], person['name'])
        for (kid, score) in candidates:
            if kid == person['_id']:
                continue
            if ';'.join([person['_id'], kid]) in done:
                continue
            cand = persons.find_one({'_id': kid})
            if not cand:
                continue
            if _birthYearsFarApart(person, cand):
                continue
            args = {'where': 'visa', 'what': '/view/relErr', 'typ': 'dubblett',
                    'person': str(person['_id']), 'family': str(kid)}
            tab.append(["%3.0f" % (score), pstr, kid, _visaButton(args)])
            # Stored reversed so the pair is skipped when kid's turn comes.
            done.add(';'.join([kid, person['_id']]))
    return tab