Example #1
 def merge_file(filename):
     data = Data(filename)
     output = data.get('OUTPUT')
     collections = data.collections()
     merge = Merge(collections)
     generator = merge.generator()
     return (generator, output)
Example #2
def main():
    url="http://www.occ.gov/topics/licensing/interpretations-and-actions/index-interpretations-and-actions.html"
    
    r1=Range98_96(url)
    r2=Range04_99(url)
    r3=Range05_10(url)
    r4=Range11_12(url)
    m=Merge()
    print "                     WEB SCRAPING    "
    print "             ============================= "
    print "Fetching the url for range 1996-1998:"
    r1.fetchurl98_96()
    print "Fetching the url for range 1999-2004:"
    r2.fetchurl04_99()
    print "Fetching the url for range 2005-2010:"
    r3.fetchurl05_10()
    print "Fetching the url for range 2011-2012:"
    r4.fetchurl11_12()
    print "Fetching data from 1996-1998"
    r1.fetchdata98_96()
    print "Fetching data from 1999-2004"
    r2.fetchdata04_99()
    print "Fetching data from 2005-2010"
    r3.fetchdata05_10()
    print "Fetching data from 2011-2012"
    r4.fetchdata11_12()
    print "Merging all the csv files"
    m.call()
    print "Check the ouput CSV File (The Required Database) "
    print "         ***************************************  "
Example #3
def task_merge_doubanvideo():
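    # Pop queued ids from the "task_merge_doubanvideo" set (rd.spop, presumably a Redis client)
    # and merge each matching douban video document until the set is empty;
    # task_merge_youku_videos and task_merge_letvstar below follow the same pattern.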
    m = Merge()
    while True:
        task = rd.spop("task_merge_doubanvideo")
        if task:
            m.merge_doubanvideo(query={"_id": ObjectId(task)})
        else:
            break
Example #4
def task_merge_youku_videos():
    m = Merge()
    while True:
        task = rd.spop("task_merge_youku_videos")
        if task:
            m.merge_youku_videos(query={"_id": ObjectId(task)})
        else:
            break
Example #5
def task_merge_letvstar():
    m = Merge()
    while True:
        task = rd.spop("task_merge_letvstar")
        if task:
            m.merge_letvstar(query={"_id": ObjectId(task)})
        else:
            break
Example #6
 def test_range(self):
     cnt = 5
     lists = [iter(xrange(sys.maxint)) for _ in range(cnt)]
     merge = Merge(lists)
     generator = merge.generator()
     for value in range(10):
         for _ in range(cnt):
             assert next(generator) == value
Example #7
    def test_infinity(self):
        def infinity(value):
            while True:
                yield value

        merge = Merge((infinity(1), infinity(2)))
        generator = merge.generator()
        assert next(generator) == 1
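
The Merge class exercised by these tests is not shown in the examples. As a point of reference only, a minimal sketch consistent with the behaviour the tests assume (lazily merging several pre-sorted iterators into one sorted stream) could be built on heapq.merge; the class name and generator() method mirror the tests, everything else here is an assumption.

import heapq

class Merge:
    """Hypothetical sketch of a k-way merge over pre-sorted iterators."""

    def __init__(self, iterables):
        # Each element of `iterables` is assumed to be an iterator that
        # already yields its values in ascending order.
        self._iterables = iterables

    def generator(self):
        # heapq.merge is lazy: it only advances the input iterators as values
        # are requested, so infinite inputs (as in test_infinity) still work.
        return heapq.merge(*self._iterables)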
Example #8
    def verify_fileds(self):
        # This function checks that the required fields have been
        # filled in correctly and shows an error message if not.

        invisbale_button = self.invisbale_button.get()
        merge_button = self.merge_button.get()
        icon_button = self.icon_button.get()
        filename = self.entry_var.get()
        icon = self.icon.get()
        if filename.endswith(('.py', '.pyw')):
            if invisbale_button and merge_button and icon_button:
                if icon.endswith('.ico'):
                    # Calling the Ain1 class
                    a1 = Ain1(self.master, filename, icon)
                    a1.merge()
                else:
                    messagebox.showerror(parent=self.master, title='Icon Error', message="Icon file should end with a (.ico) extension")
            elif invisbale_button and merge_button:
                # Calling the Inmerge class
                inmerge = Inmerge(self.master, filename)
                inmerge.merge()
            elif invisbale_button and icon_button:
                if icon.endswith('.ico'):
                    # Calling the Inicon class
                    inicon = Inicon(self.master, filename, icon)
                    inicon.merge()
                else:
                    messagebox.showerror(parent=self.master, title='Icon Error', message="Icon file should end with a (.ico) extension")
            elif merge_button and icon_button:
                if icon.endswith('.ico'):
                    # Calling the MergeIcon class
                    mergeicon = MergeIcon(self.master, filename, icon)
                    mergeicon.merge()
                else:
                    messagebox.showerror(parent=self.master, title='Icon Error', message="Icon file should end with a (.ico) extension")
            elif invisbale_button:
                # Calling the Hidden class
                hidden = Hidden(self.master, filename)
                hidden.merge()
            elif merge_button:
                # Calling the Merge class
                merge = Merge(self.master, filename)
                merge.merge()
            elif icon_button:
                messagebox.showerror(parent=self.master, title='Icon Error', message="Icon cannot be merged alone")
            else:
                messagebox.showerror(parent=self.master, title='Invalid Error', message="Please at least select merge")
        else:
            messagebox.showerror(parent=self.master, title='Filename Error', message="File should end with a (.py or .pyw) extension.")
Example #9
 def selBtnEvent(self):
     for widget in self.workFrm.winfo_children():
         widget.destroy()
     if self.selV.get() == 1:
         Merge(self)
     if self.selV.get() == 2:
         Split(self)
     if self.selV.get() == 3:
         Insert(self)
     if self.selV.get() == 4:
         Delete(self)
Example #10
    def create_objects(self, test_directory):
        """Traverses the XML structure and creates test objects

        Arguments:
        test_directory - The test directory.
        """
        for element in self.root:
            if element.tag == 'maker':
                self.tests.append(Maker(element, test_directory))
            if element.tag == 'merge':
                self.tests.append(Merge(element, test_directory))
Example #11
def Experiment():
	l = 2**9
	M= 200
	N = l*M
	i=1
	#M = 3 * 2**24

	print "Experiment, l: " + str(l) + ' M: ' + str(M)
	origin = Sfile('origin')
	inputs = Sfile('inputs')
	outputs= Sfile('outputs')

	origin.delete()
	inputs.delete()
	outputs.delete()

	origin.reopen()
	inputs.reopen()
	outputs.reopen()

	writeRandomNumbers(origin,N)
	origin.reopen()

	init_time = time()
	Ms = Merge(l,M,origin,inputs,outputs) 
	Ms.mergesort_k(i)
	end_time = time()

	print 'time: ' + str(end_time - init_time) + ' [sg]'

	#outputs.close()
	#outputs.reopen()
	#outputs.seek(0,0)
	#values = outputs.reads(N)
	#sortTest(values)
	#print str(len(values)) + ' result: ' + str(values)

	origin.close()
	inputs.close()
	outputs.close()
Example #12
def launch(baseType='mecanum', debug=True, record=True, openViewer=False, openROS=False):
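    # Construct the robot stack once (base, merge, dash, feed, flow, remote, spy);
    # later calls just re-enable the base and optionally start recording, the viewer, or the ROS bridge.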
    global __isLaunched
    if not __isLaunched:
        global base, merge, dash, feed, flow, remote, spy
        base = Mecanum() if baseType == 'mecanum' else Omni()
        merge = Merge()
        dash = Dash(base, merge)
        feed = Feed(dash)
        flow = Flow(dash, debug=debug)
        remote = Remote(dash, flow)
        spy = Spy(dash)
        __isLaunched = True
    base.enable()
    if record:
        spy.begin()
    if openViewer:
        from viewer import Viewer
        global viewer
        viewer = Viewer(dash)
    if openROS:
        from ros import Ros
        global ros
        ros = Ros(dash)
Example #13
def task_groupLanguageAndProducer_country():
    m = Merge()
    m.groupLanguageAndProducer_country(query={})
Example #14
def task_rm_unknown():
    m = Merge()
    m.rm_unknown(query={})
Example #15
def task_clean_category():
    m = Merge()
    m.clean_category(query={})
Example #16
# from scrapy.cmdline import execute
# from os import system
import subprocess
from merge import Merge

name = '22tu'

if __name__ == '__main__':
    # cmd = f'scrapy crawl {name}'
    # execute(cmd.split(' '))
    for episode in range(30, 31):
        SOAP_ID = '28230'
        EPISODE = episode
        SOAP_NAME = 'doutinghao'
        print(f"正在下载的片名是:{SOAP_NAME}\n剧集是:{EPISODE}")
        child_crawl = subprocess.call(f"scrapy crawl {name} \
        -a SOAP_ID={SOAP_ID} -a EPISODE={EPISODE} -a SOAP_NAME={SOAP_NAME}")
        # child_crawl.wait()
        print("Merging the ts files")
        merge = Merge(SOAP_NAME=SOAP_NAME, EPISODE=EPISODE)
        merge.main()
        print(f"{SOAP_NAME}_{EPISODE}下载完毕")
Example #17
def task_categories():
    m = Merge()
    m.groupCategories(query={})
Example #18
 def test_numbers(self):
     lists = ((1, 2), (2, 3, 5), (3, 4))
     lists = [iter(l) for l in lists]
     merge = Merge(lists)
     generator = merge.generator()
     assert list(generator) == [1, 2, 2, 3, 3, 4, 5]
Example #19
output_filename = 'output.docx'
output_filepath = os.path.join(os.getcwd(), output_filename)

tmp_dir1 = os.path.join(os.getcwd(), 'tmp\\split')
tmp_dir2 = os.path.join(os.getcwd(), 'tmp\\split-processed')

from misc import mkdir

if __name__ == "__main__":
    #
    extract = Extract(input_filepath)
    extract.process()  #output_filepath=extract_filepath)
    db['project_info'].set_db(extract.extract_project_infos())
    #
    mkdir(tmp_dir1)
    split = Split(input_filepath=extract_filepath)
    sections = split.process()
    #
    db['finance'].filtering(need_years=3)
    db['human'].select_people(name_list=['总经理姓名', '联系人姓名', '项目经理人姓名'])
    db['projects_done'].filtering(project_types=['水利'], need_years=3)
    db['projects_being'].filtering(project_types=['水利'])
    #
    mkdir(tmp_dir2)
    for section in sections:
        fillin = FillIn(os.path.join(tmp_dir1, section + '.docx'))
        fillin.process(os.path.join(tmp_dir2, section + '.docx'))
    #
    merge = Merge(tmpdir=tmp_dir2, section_names=sections)
    merge.process(output_filepath)
    #
Example #20
class Extract:
    def __init__(self, jieba=False):
        import re
        self.de = re.compile(u"[\u4e00-\u9fa5]")
        self.jieba = jieba
        self.relation = {
            u'fuqin': ('PERSON', 'PERSON'),
            u'erzi': ('PERSON', 'PERSON'),
            u'nver': ('PERSON', 'PERSON'),
            u'nvyou': ('PERSON', 'PERSON'),
            u'nanyou': ('PERSON', 'PERSON'),
            u'muqin': ('PERSON', 'PERSON'),
            u'emma': ('PERSON', 'PERSON'),
            u'zhangfu': ('PERSON', 'PERSON'),
            u'qizi': ('PERSON', 'PERSON'),
            u'\u5973\u53cb': ('PERSON', 'PERSON'),
            u'\u5973\u513f': ('PERSON', 'PERSON'),
            u'\u59bb\u5b50': ('PERSON', 'PERSON'),
            u'\u4e08\u592b': ('PERSON', 'PERSON'),
            u'\u524d\u592b': ('PERSON', 'PERSON'),
            u'\u7236\u4eb2': ('PERSON', 'PERSON'),
            u'\u8eab\u9ad8': ('PERSON', 'HEIGHT'),
            u'\u751f\u65e5': ('PERSON', 'DATE'),
            u'\u64ad\u51fa\u65f6\u95f4': ('FILM', 'TIME'),
            u'\u4e3b\u9898\u66f2': ('FILM', 'MUSIC')
        }
        self.pos_tagger = {
            'a': 0,
            'ad': 1,
            'ag': 2,
            'an': 3,
            'b': 4,
            'bg': 5,
            'c': 6,
            'd': 7,
            'df': 8,
            'dg': 9,
            'e': 10,
            'en': 11,
            'f': 12,
            'g': 13,
            'h': 14,
            'i': 15,
            'in': 16,
            'j': 17,
            'jn': 18,
            'k': 19,
            'l': 20,
            'ln': 21,
            'm': 22,
            'mg': 23,
            'mq': 24,
            'n': 25,
            'ng': 26,
            'nr': 27,
            'nrfg': 28,
            'nrt': 29,
            'ns': 30,
            'nt': 31,
            'nz': 32,
            'o': 33,
            'p': 34,
            'q': 35,
            'qe': 36,
            'qg': 37,
            'r': 38,
            'rg': 39,
            'rr': 40,
            'rz': 41,
            's': 42,
            't': 43,
            'tg': 44,
            'u': 45,
            'ud': 46,
            'ug': 47,
            'uj': 48,
            'ul': 49,
            'uv': 50,
            'uz': 51,
            'v': 52,
            'vd': 53,
            'vg': 54,
            'vi': 55,
            'vn': 56,
            'vq': 57,
            'w': 58,
            'x': 59,
            'y': 60,
            'yg': 61,
            'z': 62,
            'zg': 63,
            'a': 64,
            'ad': 65,
            'ag': 66,
            'an': 67,
            'b': 68,
            'bg': 69,
            'c': 70,
            'd': 71,
            'df': 72,
            'dg': 73,
            'e': 74,
            'en': 75,
            'f': 76,
            'g': 77,
            'h': 78,
            'i': 79,
            'in': 80,
            'j': 81,
            'jn': 82,
            'k': 83,
            'l': 84,
            'ln': 85,
            'm': 86,
            'mg': 87,
            'mq': 88,
            'n': 89,
            'ng': 90,
            'nr': 91,
            'nrfg': 92,
            'nrt': 93,
            'ns': 94,
            'nt': 95,
            'nz': 96,
            'o': 97,
            'p': 98,
            'q': 99,
            'qe': 100,
            'qg': 101,
            'r': 102,
            'rg': 103,
            'rr': 104,
            'rz': 105,
            's': 106,
            't': 107,
            'tg': 108,
            'u': 109,
            'ud': 110,
            'ug': 111,
            'uj': 112,
            'ul': 113,
            'uv': 114,
            'uz': 115,
            'v': 116,
            'vd': 117,
            'vg': 118,
            'vi': 119,
            'vn': 120,
            'vq': 121,
            'w': 122,
            'x': 123,
            'y': 124,
            'yg': 125,
            'z': 126,
            'zg': 127,
            'a': 128,
            'ad': 129,
            'ag': 130,
            'an': 131,
            'b': 132,
            'bg': 133,
            'c': 134,
            'd': 135,
            'df': 136,
            'dg': 137,
            'e': 138,
            'en': 139,
            'f': 140,
            'g': 141,
            'h': 142,
            'i': 143,
            'in': 144,
            'j': 145,
            'jn': 146,
            'k': 147,
            'l': 148,
            'ln': 149,
            'm': 150,
            'mg': 151,
            'mq': 152,
            'n': 153,
            'ng': 154,
            'nr': 155,
            'nrfg': 156,
            'nrt': 157,
            'ns': 158,
            'nt': 159,
            'nz': 160,
            'o': 161,
            'p': 162,
            'q': 163,
            'qe': 164,
            'qg': 165,
            'r': 166,
            'rg': 167,
            'rr': 168,
            'rz': 169,
            's': 170,
            't': 171,
            'tg': 172,
            'u': 173,
            'ud': 174,
            'ug': 175,
            'uj': 176,
            'ul': 177,
            'uv': 178,
            'uz': 179,
            'v': 180,
            'vd': 181,
            'vg': 182,
            'vi': 183,
            'vn': 184,
            'vq': 185,
            'w': 186,
            'x': 187,
            'y': 188,
            'yg': 189,
            'z': 190,
            'zg': 191,
            'a': 192,
            'ad': 193,
            'ag': 194,
            'an': 195,
            'b': 196,
            'bg': 197,
            'c': 198,
            'd': 199,
            'df': 200,
            'dg': 201,
            'e': 202,
            'en': 203,
            'f': 204,
            'g': 205,
            'h': 206,
            'i': 207,
            'in': 208,
            'j': 209,
            'jn': 210,
            'k': 211,
            'l': 212,
            'ln': 213,
            'm': 214,
            'mg': 215,
            'mq': 216,
            'n': 217,
            'ng': 218,
            'nr': 219,
            'nrfg': 220,
            'nrt': 221,
            'ns': 222,
            'nt': 223,
            'nz': 224,
            'o': 225,
            'p': 226,
            'q': 227,
            'qe': 228,
            'qg': 229,
            'r': 230,
            'rg': 231,
            'rr': 232,
            'rz': 233,
            's': 234,
            't': 235,
            'tg': 236,
            'u': 237,
            'ud': 238,
            'ug': 239,
            'uj': 240,
            'ul': 241,
            'uv': 242,
            'uz': 243,
            'v': 244,
            'vd': 245,
            'vg': 246,
            'vi': 247,
            'vn': 248,
            'vq': 249,
            'w': 250,
            'x': 251,
            'y': 252,
            'yg': 253,
            'z': 254,
            'zg': 255,
            'eng': 256
        }
        self.m = Merge(True, False)
        #self.m = Merge(True,True)
        pass

    # Get the NER tags using Merge and look up the relation's NER type
    def _process_data(self, lines, newwords, n2, tags=None):
        s = []
        p = []
        _seg = []
        _ner = []
        self.m.add_new_words(newwords)
        if n2 is not None:
            self.m.add_new_words(n2)
        for i in xrange(len(lines)):
            line = lines[i]
            (line_seg, line_pos, line_ner) = self.m.ner_using_nlpc(line)
            #(line_ner,line_pos,line_seg,line_dep) = self.m.get_line_info(line,False)
            if tags is not None:
                tag = tags[i]
                k = line_ner.count((self.relation[tag.decode('utf-8')])[1])
                if k == 0:
                    continue
                elif k == 1:
                    if (self.relation[tag.decode('utf-8')])[1] == (
                            self.relation[tag.decode('utf-8')])[1]:
                        continue
            else:
                return
            seg = line_seg.split('\t')
            pos = line_pos.split('\t')
            ner = line_ner.split('\t')
            s.append(newwords[i][0].decode('utf-8'))
            p.append(tag)
            _seg.append(seg)
            _ner.append(ner)
        return (s, p, _seg, _ner)

    def statistics(self, newwords, tags, segs, ners):
        s = []
        p = []
        answer = []
        fromline = []
        for i in xrange(len(tags)):
            tag = tags[i]
            seg = segs[i]
            ner = ners[i]
            _a = []
            print ' '.join(seg).encode('utf-8')
            for id in xrange(len(seg)):
                if tags is not None:
                    if ner[id] == (self.relation[tag.decode('utf-8')])[1]:
                        ll = len(self.de.findall(seg[id]))
                        if ll == 0:
                            ll = len(seg[id])
                        if (seg[id] !=
                                newwords[i]) and (seg[id] not in _a) and (
                                    ll > 1) and seg[id].isdigit() == False:
                            print newwords[i].encode(
                                'utf-8') + ',' + tag.encode(
                                    'utf-8') + ',' + seg[id].encode('utf-8')
                            _a.append(seg[id])
                            answer.append(seg[id])
                            s.append(newwords[i])
                            fromline.append(''.join(seg))
                            p.append(tag)
        dict = collections.OrderedDict()
        for i in xrange(len(s)):
            s[i] = s[i].decode('utf-8')
            spo = s[i] + p[i] + answer[i]
            if spo in dict:
                dict[spo][2] += 1
            else:
                dict[spo] = []
                dict[spo].append(s[i] + '\t' + p[i])
                dict[spo].append(answer[i])
                dict[spo].append(1)
                dict[spo].append(fromline[i])
        #result = {'sp':[[answer,count,line]]}
        result = collections.OrderedDict()
        for (k, v) in dict.items():
            sp = v[0]
            if sp in result:
                if v[2] > result[sp][0][1]:
                    result[sp] = []
                    ddd = []
                    ddd.append(v[1])
                    ddd.append(v[2])
                    ddd.append(v[3])
                    result[sp].append(ddd)
                elif v[2] == result[sp][0][1]:
                    ddd = []
                    ddd.append(v[1])
                    ddd.append(v[2])
                    ddd.append(v[3])
                    result[sp].append(ddd)
            else:
                result[sp] = []
                ddd = []
                ddd.append(v[1])
                ddd.append(v[2])
                ddd.append(v[3])
                result[sp].append(ddd)
        list = []
        for (k, v) in result.items():
            for i in xrange(len(v)):
                value = v[i]
                if value[1] == 1:
                    list.append(k + '\t' + value[0] + '\t' + 'not sure' +
                                '\t' + value[2])
                else:
                    list.append(k + '\t' + value[0] + '\t' + str(value[1]) +
                                '\t' + value[2])
        return list

    def test3(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        for line in sys.stdin:
            try:
                line = line.split('\t')
                if len(line) < 5:
                    print 'read wrong:' + '\t'.join(line)
                    continue
                tags.append(line[1])
                newwords.append(
                    (line[0], (self.relation[line[1].decode('utf-8')])[0]))
                newwords2.append(
                    (line[2], (self.relation[line[1].decode('utf-8')])[0]))
                if line[4].strip() != '':
                    lines.append(line[4].strip())
                else:
                    print 'read wrong:' + '\t'.join(line)
            except:
                print 'read wrong:' + '\t'.join(line)
        (s, p, _seg, _ner) = self._process_data(lines,
                                                newwords,
                                                None,
                                                tags=tags)
        list = self.statistics(s, p, _seg, _ner)
        for l in list:
            print l.encode('utf-8')

    def test2(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        for line in sys.stdin:
            try:
                line = line.split('\t')
                if len(line) < 6:
                    print 'read wrong:' + '\t'.join(line)
                    continue
                tags.append(line[1])
                newwords.append(
                    (line[0], (self.relation[line[1].decode('utf-8')])[0]))
                newwords2.append(
                    (line[2], (self.relation[line[1].decode('utf-8')])[0]))
                if line[5].strip() != '':
                    lines.append(line[5].strip())
                else:
                    print 'read wrong:' + '\t'.join(line)
            except:
                print 'read wrong:' + '\t'.join(line)
        (s, p, _seg, _ner) = self._process_data(lines,
                                                newwords,
                                                newwords2,
                                                tags=tags)
        list = self.statistics(s, p, _seg, _ner)
        for l in list:
            print l.encode('utf-8')

    def test1(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        ss = ''
        pstr = ''
        anstr = ''
        check = False
        wf = open('result_fanhua_1', 'ab')
        current = 0
        all = 0
        for line in sys.stdin:
            line = line.split('\t')
            if len(line) < 5:
                print 'read wrong:' + '\t'.join(line)
                continue
            if ss != line[0] and check:
                (s, p, _seg, _ner) = self._process_data(lines,
                                                        newwords,
                                                        newwords2,
                                                        tags=tags)
                list = self.statistics(s, p, _seg, _ner)
                wf.write(ss + '\t' + pstr + '\t' + anstr + '\n')
                all += 1
                for l in list:
                    print 'result' + l
                    wf.write(l + "\n\n")
                    if l.split('\t')[2] == anstr:
                        current += 1
                print current
                lines = []
                tags = []
                newwords = []
                newwords2 = []
            ss = line[0]
            pstr = line[1]
            anstr = line[2]
            tags.append(line[1])
            newwords.append(
                (line[0], (self.relation[line[1].decode('utf-8')])[0]))
            newwords2.append(
                (line[2], (self.relation[line[1].decode('utf-8')])[0]))
            if line[4].strip() != '':
                check = True
                lines.append(line[4].strip())
            else:
                print 'read wrong:' + '\t'.join(line)
        wf.write('all' + str(all))
        wf.write('current' + str(current))
        wf.close()

    def test(self):
        lines = []
        tags = []
        newwords = []
        for line in sys.stdin:
            try:
                line = line.split(' \t')
                tags.append(line[0].split('\t')[1])
                newwords.append((line[0].split('\t')[0], (
                    self.relation[line[0].split('\t')[1].decode('utf-8')])[0]))
                lines.append(line[1].strip())
            except:
                print line
                quit()
        (s, p, answer) = self._process_data(lines, newwords, None, tags=tags)
        list = self.statistics(s, p, answer)
        for l in list:
            print l.encode('utf-8')
Example #21
 def __init__(self, jieba=False):
     import re
     self.de = re.compile(u"[\u4e00-\u9fa5]")
     self.jieba = jieba
     self.relation = {
         u'fuqin': ('PERSON', 'PERSON'),
         u'erzi': ('PERSON', 'PERSON'),
         u'nver': ('PERSON', 'PERSON'),
         u'nvyou': ('PERSON', 'PERSON'),
         u'nanyou': ('PERSON', 'PERSON'),
         u'muqin': ('PERSON', 'PERSON'),
         u'emma': ('PERSON', 'PERSON'),
         u'zhangfu': ('PERSON', 'PERSON'),
         u'qizi': ('PERSON', 'PERSON'),
         u'\u5973\u53cb': ('PERSON', 'PERSON'),
         u'\u5973\u513f': ('PERSON', 'PERSON'),
         u'\u59bb\u5b50': ('PERSON', 'PERSON'),
         u'\u4e08\u592b': ('PERSON', 'PERSON'),
         u'\u524d\u592b': ('PERSON', 'PERSON'),
         u'\u7236\u4eb2': ('PERSON', 'PERSON'),
         u'\u8eab\u9ad8': ('PERSON', 'HEIGHT'),
         u'\u751f\u65e5': ('PERSON', 'DATE'),
         u'\u64ad\u51fa\u65f6\u95f4': ('FILM', 'TIME'),
         u'\u4e3b\u9898\u66f2': ('FILM', 'MUSIC')
     }
     self.pos_tagger = {
         'a': 0,
         'ad': 1,
         'ag': 2,
         'an': 3,
         'b': 4,
         'bg': 5,
         'c': 6,
         'd': 7,
         'df': 8,
         'dg': 9,
         'e': 10,
         'en': 11,
         'f': 12,
         'g': 13,
         'h': 14,
         'i': 15,
         'in': 16,
         'j': 17,
         'jn': 18,
         'k': 19,
         'l': 20,
         'ln': 21,
         'm': 22,
         'mg': 23,
         'mq': 24,
         'n': 25,
         'ng': 26,
         'nr': 27,
         'nrfg': 28,
         'nrt': 29,
         'ns': 30,
         'nt': 31,
         'nz': 32,
         'o': 33,
         'p': 34,
         'q': 35,
         'qe': 36,
         'qg': 37,
         'r': 38,
         'rg': 39,
         'rr': 40,
         'rz': 41,
         's': 42,
         't': 43,
         'tg': 44,
         'u': 45,
         'ud': 46,
         'ug': 47,
         'uj': 48,
         'ul': 49,
         'uv': 50,
         'uz': 51,
         'v': 52,
         'vd': 53,
         'vg': 54,
         'vi': 55,
         'vn': 56,
         'vq': 57,
         'w': 58,
         'x': 59,
         'y': 60,
         'yg': 61,
         'z': 62,
         'zg': 63,
         'a': 64,
         'ad': 65,
         'ag': 66,
         'an': 67,
         'b': 68,
         'bg': 69,
         'c': 70,
         'd': 71,
         'df': 72,
         'dg': 73,
         'e': 74,
         'en': 75,
         'f': 76,
         'g': 77,
         'h': 78,
         'i': 79,
         'in': 80,
         'j': 81,
         'jn': 82,
         'k': 83,
         'l': 84,
         'ln': 85,
         'm': 86,
         'mg': 87,
         'mq': 88,
         'n': 89,
         'ng': 90,
         'nr': 91,
         'nrfg': 92,
         'nrt': 93,
         'ns': 94,
         'nt': 95,
         'nz': 96,
         'o': 97,
         'p': 98,
         'q': 99,
         'qe': 100,
         'qg': 101,
         'r': 102,
         'rg': 103,
         'rr': 104,
         'rz': 105,
         's': 106,
         't': 107,
         'tg': 108,
         'u': 109,
         'ud': 110,
         'ug': 111,
         'uj': 112,
         'ul': 113,
         'uv': 114,
         'uz': 115,
         'v': 116,
         'vd': 117,
         'vg': 118,
         'vi': 119,
         'vn': 120,
         'vq': 121,
         'w': 122,
         'x': 123,
         'y': 124,
         'yg': 125,
         'z': 126,
         'zg': 127,
         'a': 128,
         'ad': 129,
         'ag': 130,
         'an': 131,
         'b': 132,
         'bg': 133,
         'c': 134,
         'd': 135,
         'df': 136,
         'dg': 137,
         'e': 138,
         'en': 139,
         'f': 140,
         'g': 141,
         'h': 142,
         'i': 143,
         'in': 144,
         'j': 145,
         'jn': 146,
         'k': 147,
         'l': 148,
         'ln': 149,
         'm': 150,
         'mg': 151,
         'mq': 152,
         'n': 153,
         'ng': 154,
         'nr': 155,
         'nrfg': 156,
         'nrt': 157,
         'ns': 158,
         'nt': 159,
         'nz': 160,
         'o': 161,
         'p': 162,
         'q': 163,
         'qe': 164,
         'qg': 165,
         'r': 166,
         'rg': 167,
         'rr': 168,
         'rz': 169,
         's': 170,
         't': 171,
         'tg': 172,
         'u': 173,
         'ud': 174,
         'ug': 175,
         'uj': 176,
         'ul': 177,
         'uv': 178,
         'uz': 179,
         'v': 180,
         'vd': 181,
         'vg': 182,
         'vi': 183,
         'vn': 184,
         'vq': 185,
         'w': 186,
         'x': 187,
         'y': 188,
         'yg': 189,
         'z': 190,
         'zg': 191,
         'a': 192,
         'ad': 193,
         'ag': 194,
         'an': 195,
         'b': 196,
         'bg': 197,
         'c': 198,
         'd': 199,
         'df': 200,
         'dg': 201,
         'e': 202,
         'en': 203,
         'f': 204,
         'g': 205,
         'h': 206,
         'i': 207,
         'in': 208,
         'j': 209,
         'jn': 210,
         'k': 211,
         'l': 212,
         'ln': 213,
         'm': 214,
         'mg': 215,
         'mq': 216,
         'n': 217,
         'ng': 218,
         'nr': 219,
         'nrfg': 220,
         'nrt': 221,
         'ns': 222,
         'nt': 223,
         'nz': 224,
         'o': 225,
         'p': 226,
         'q': 227,
         'qe': 228,
         'qg': 229,
         'r': 230,
         'rg': 231,
         'rr': 232,
         'rz': 233,
         's': 234,
         't': 235,
         'tg': 236,
         'u': 237,
         'ud': 238,
         'ug': 239,
         'uj': 240,
         'ul': 241,
         'uv': 242,
         'uz': 243,
         'v': 244,
         'vd': 245,
         'vg': 246,
         'vi': 247,
         'vn': 248,
         'vq': 249,
         'w': 250,
         'x': 251,
         'y': 252,
         'yg': 253,
         'z': 254,
         'zg': 255,
         'eng': 256
     }
     self.m = Merge(True, False)
     #self.m = Merge(True,True)
     pass
Example #22
 def test_strings(self):
     lists = (("b", "e"), ("c", "d"), ("a", "z"))
     lists = [iter(l) for l in lists]
     merge = Merge(lists)
     generator = merge.generator()
     assert list(generator) == ['a', 'b', 'c', 'd', 'e', 'z']
Example #23
# 100 days of code day 24: mail merge.
from merge import Merge

letter = "./Input/Letters/starting_letter.txt"
names = "./Input/Names/invited_names.txt"

print("Running mail merge\n")

merge = Merge(letter, names)

count = merge.merge()

print(f"Finished.  Processed {count} records.")

Example #24
                                     password=SourceDBFrom["Password"])
filter_source_db_from.run()

filter_source_db_to = FilterPlayer(server_id=SourceDBTo["ServerID"],
                                   host=SourceDBTo["Host"],
                                   db=SourceDBTo["Database"],
                                   user=SourceDBTo["User"],
                                   password=SourceDBTo["Password"])
filter_source_db_to.run()

# Handle duplicate player names
rename_player_proc = RenamePlayer(SourceDBFrom, SourceDBTo)
rename_player_proc.run()

# Process guild data
rename_guild_proc = RenameGuild(SourceDBFrom, SourceDBTo)
rename_guild_proc.run()

# Merge the data
merge_proc = Merge(SourceDBFrom, SourceDBTo, TargetDB)
merge_proc.run()

# Update the data
update_proc = UpdateDB(SourceDBFrom, TargetDB)
update_proc.run()

end = time.time()
print "Elapsed %d seconds in total." % (end - begin)
print "Merge [%s] and [%s] to [%s] succeed." % (
    SourceDBFrom["Database"], SourceDBTo["Database"], TargetDB["Database"])
Example #25
class Extract:
    def __init__(self, jieba=False):
        import re

        self.de = re.compile(u"[\u4e00-\u9fa5]")
        self.jieba = jieba
        self.relation = {
            u"fuqin": ("PERSON", "PERSON"),
            u"erzi": ("PERSON", "PERSON"),
            u"nver": ("PERSON", "PERSON"),
            u"nvyou": ("PERSON", "PERSON"),
            u"nanyou": ("PERSON", "PERSON"),
            u"muqin": ("PERSON", "PERSON"),
            u"emma": ("PERSON", "PERSON"),
            u"zhangfu": ("PERSON", "PERSON"),
            u"qizi": ("PERSON", "PERSON"),
            u"\u5973\u53cb": ("PERSON", "PERSON"),
            u"\u5973\u513f": ("PERSON", "PERSON"),
            u"\u59bb\u5b50": ("PERSON", "PERSON"),
            u"\u4e08\u592b": ("PERSON", "PERSON"),
            u"\u524d\u592b": ("PERSON", "PERSON"),
            u"\u7236\u4eb2": ("PERSON", "PERSON"),
            u"\u8eab\u9ad8": ("PERSON", "HEIGHT"),
            u"\u751f\u65e5": ("PERSON", "DATE"),
            u"\u64ad\u51fa\u65f6\u95f4": ("FILM", "TIME"),
            u"\u4e3b\u9898\u66f2": ("FILM", "MUSIC"),
        }
        self.pos_tagger = {
            "a": 0,
            "ad": 1,
            "ag": 2,
            "an": 3,
            "b": 4,
            "bg": 5,
            "c": 6,
            "d": 7,
            "df": 8,
            "dg": 9,
            "e": 10,
            "en": 11,
            "f": 12,
            "g": 13,
            "h": 14,
            "i": 15,
            "in": 16,
            "j": 17,
            "jn": 18,
            "k": 19,
            "l": 20,
            "ln": 21,
            "m": 22,
            "mg": 23,
            "mq": 24,
            "n": 25,
            "ng": 26,
            "nr": 27,
            "nrfg": 28,
            "nrt": 29,
            "ns": 30,
            "nt": 31,
            "nz": 32,
            "o": 33,
            "p": 34,
            "q": 35,
            "qe": 36,
            "qg": 37,
            "r": 38,
            "rg": 39,
            "rr": 40,
            "rz": 41,
            "s": 42,
            "t": 43,
            "tg": 44,
            "u": 45,
            "ud": 46,
            "ug": 47,
            "uj": 48,
            "ul": 49,
            "uv": 50,
            "uz": 51,
            "v": 52,
            "vd": 53,
            "vg": 54,
            "vi": 55,
            "vn": 56,
            "vq": 57,
            "w": 58,
            "x": 59,
            "y": 60,
            "yg": 61,
            "z": 62,
            "zg": 63,
            "a": 64,
            "ad": 65,
            "ag": 66,
            "an": 67,
            "b": 68,
            "bg": 69,
            "c": 70,
            "d": 71,
            "df": 72,
            "dg": 73,
            "e": 74,
            "en": 75,
            "f": 76,
            "g": 77,
            "h": 78,
            "i": 79,
            "in": 80,
            "j": 81,
            "jn": 82,
            "k": 83,
            "l": 84,
            "ln": 85,
            "m": 86,
            "mg": 87,
            "mq": 88,
            "n": 89,
            "ng": 90,
            "nr": 91,
            "nrfg": 92,
            "nrt": 93,
            "ns": 94,
            "nt": 95,
            "nz": 96,
            "o": 97,
            "p": 98,
            "q": 99,
            "qe": 100,
            "qg": 101,
            "r": 102,
            "rg": 103,
            "rr": 104,
            "rz": 105,
            "s": 106,
            "t": 107,
            "tg": 108,
            "u": 109,
            "ud": 110,
            "ug": 111,
            "uj": 112,
            "ul": 113,
            "uv": 114,
            "uz": 115,
            "v": 116,
            "vd": 117,
            "vg": 118,
            "vi": 119,
            "vn": 120,
            "vq": 121,
            "w": 122,
            "x": 123,
            "y": 124,
            "yg": 125,
            "z": 126,
            "zg": 127,
            "a": 128,
            "ad": 129,
            "ag": 130,
            "an": 131,
            "b": 132,
            "bg": 133,
            "c": 134,
            "d": 135,
            "df": 136,
            "dg": 137,
            "e": 138,
            "en": 139,
            "f": 140,
            "g": 141,
            "h": 142,
            "i": 143,
            "in": 144,
            "j": 145,
            "jn": 146,
            "k": 147,
            "l": 148,
            "ln": 149,
            "m": 150,
            "mg": 151,
            "mq": 152,
            "n": 153,
            "ng": 154,
            "nr": 155,
            "nrfg": 156,
            "nrt": 157,
            "ns": 158,
            "nt": 159,
            "nz": 160,
            "o": 161,
            "p": 162,
            "q": 163,
            "qe": 164,
            "qg": 165,
            "r": 166,
            "rg": 167,
            "rr": 168,
            "rz": 169,
            "s": 170,
            "t": 171,
            "tg": 172,
            "u": 173,
            "ud": 174,
            "ug": 175,
            "uj": 176,
            "ul": 177,
            "uv": 178,
            "uz": 179,
            "v": 180,
            "vd": 181,
            "vg": 182,
            "vi": 183,
            "vn": 184,
            "vq": 185,
            "w": 186,
            "x": 187,
            "y": 188,
            "yg": 189,
            "z": 190,
            "zg": 191,
            "a": 192,
            "ad": 193,
            "ag": 194,
            "an": 195,
            "b": 196,
            "bg": 197,
            "c": 198,
            "d": 199,
            "df": 200,
            "dg": 201,
            "e": 202,
            "en": 203,
            "f": 204,
            "g": 205,
            "h": 206,
            "i": 207,
            "in": 208,
            "j": 209,
            "jn": 210,
            "k": 211,
            "l": 212,
            "ln": 213,
            "m": 214,
            "mg": 215,
            "mq": 216,
            "n": 217,
            "ng": 218,
            "nr": 219,
            "nrfg": 220,
            "nrt": 221,
            "ns": 222,
            "nt": 223,
            "nz": 224,
            "o": 225,
            "p": 226,
            "q": 227,
            "qe": 228,
            "qg": 229,
            "r": 230,
            "rg": 231,
            "rr": 232,
            "rz": 233,
            "s": 234,
            "t": 235,
            "tg": 236,
            "u": 237,
            "ud": 238,
            "ug": 239,
            "uj": 240,
            "ul": 241,
            "uv": 242,
            "uz": 243,
            "v": 244,
            "vd": 245,
            "vg": 246,
            "vi": 247,
            "vn": 248,
            "vq": 249,
            "w": 250,
            "x": 251,
            "y": 252,
            "yg": 253,
            "z": 254,
            "zg": 255,
            "eng": 256,
        }
        self.m = Merge(True, False)
        # self.m = Merge(True,True)
        pass

    # Get the NER tags using Merge and look up the relation's NER type

    def _process_data(self, lines, newwords, n2, tags=None):
        s = []
        p = []
        _seg = []
        _ner = []
        self.m.add_new_words(newwords)
        if n2 is not None:
            self.m.add_new_words(n2)
        for i in xrange(len(lines)):
            line = lines[i]
            (line_seg, line_pos, line_ner) = self.m.ner_using_nlpc(line)
            # (line_ner,line_pos,line_seg,line_dep) = self.m.get_line_info(line,False)
            if tags is not None:
                tag = tags[i]
                k = line_ner.count((self.relation[tag.decode("utf-8")])[1])
                if k == 0:
                    continue
                elif k == 1:
                    if (self.relation[tag.decode("utf-8")])[1] == (self.relation[tag.decode("utf-8")])[1]:
                        continue
            else:
                return
            seg = line_seg.split("\t")
            pos = line_pos.split("\t")
            ner = line_ner.split("\t")
            s.append(newwords[i][0].decode("utf-8"))
            p.append(tag)
            _seg.append(seg)
            _ner.append(ner)
        return (s, p, _seg, _ner)

    def statistics(self, newwords, tags, segs, ners):
        s = []
        p = []
        answer = []
        fromline = []
        for i in xrange(len(tags)):
            tag = tags[i]
            seg = segs[i]
            ner = ners[i]
            _a = []
            print " ".join(seg).encode("utf-8")
            for id in xrange(len(seg)):
                if tags is not None:
                    if ner[id] == (self.relation[tag.decode("utf-8")])[1]:
                        ll = len(self.de.findall(seg[id]))
                        if ll == 0:
                            ll = len(seg[id])
                        if (seg[id] != newwords[i]) and (seg[id] not in _a) and (ll > 1) and seg[id].isdigit() == False:
                            print newwords[i].encode("utf-8") + "," + tag.encode("utf-8") + "," + seg[id].encode(
                                "utf-8"
                            )
                            _a.append(seg[id])
                            answer.append(seg[id])
                            s.append(newwords[i])
                            fromline.append("".join(seg))
                            p.append(tag)
        dict = collections.OrderedDict()
        for i in xrange(len(s)):
            s[i] = s[i].decode("utf-8")
            spo = s[i] + p[i] + answer[i]
            if spo in dict:
                dict[spo][2] += 1
            else:
                dict[spo] = []
                dict[spo].append(s[i] + "\t" + p[i])
                dict[spo].append(answer[i])
                dict[spo].append(1)
                dict[spo].append(fromline[i])
                # result = {'sp':[[answer,count,line]]}
        result = collections.OrderedDict()
        for (k, v) in dict.items():
            sp = v[0]
            if sp in result:
                if v[2] > result[sp][0][1]:
                    result[sp] = []
                    ddd = []
                    ddd.append(v[1])
                    ddd.append(v[2])
                    ddd.append(v[3])
                    result[sp].append(ddd)
                elif v[2] == result[sp][0][1]:
                    ddd = []
                    ddd.append(v[1])
                    ddd.append(v[2])
                    ddd.append(v[3])
                    result[sp].append(ddd)
            else:
                result[sp] = []
                ddd = []
                ddd.append(v[1])
                ddd.append(v[2])
                ddd.append(v[3])
                result[sp].append(ddd)
        list = []
        for (k, v) in result.items():
            for i in xrange(len(v)):
                value = v[i]
                if value[1] == 1:
                    list.append(k + "\t" + value[0] + "\t" + "not sure" + "\t" + value[2])
                else:
                    list.append(k + "\t" + value[0] + "\t" + str(value[1]) + "\t" + value[2])
        return list

    def test3(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        for line in sys.stdin:
            try:
                line = line.split("\t")
                if len(line) < 5:
                    print "read wrong:" + "\t".join(line)
                    continue
                tags.append(line[1])
                newwords.append((line[0], (self.relation[line[1].decode("utf-8")])[0]))
                newwords2.append((line[2], (self.relation[line[1].decode("utf-8")])[0]))
                if line[4].strip() != "":
                    lines.append(line[4].strip())
                else:
                    print "read wrong:" + "\t".join(line)
            except:
                print "read wrong:" + "\t".join(line)
        (s, p, _seg, _ner) = self._process_data(lines, newwords, None, tags=tags)
        list = self.statistics(s, p, _seg, _ner)
        for l in list:
            print l.encode("utf-8")

    def test2(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        for line in sys.stdin:
            try:
                line = line.split("\t")
                if len(line) < 6:
                    print "read wrong:" + "\t".join(line)
                    continue
                tags.append(line[1])
                newwords.append((line[0], (self.relation[line[1].decode("utf-8")])[0]))
                newwords2.append((line[2], (self.relation[line[1].decode("utf-8")])[0]))
                if line[5].strip() != "":
                    lines.append(line[5].strip())
                else:
                    print "read wrong:" + "\t".join(line)
            except:
                print "read wrong:" + "\t".join(line)
        (s, p, _seg, _ner) = self._process_data(lines, newwords, newwords2, tags=tags)
        list = self.statistics(s, p, _seg, _ner)
        for l in list:
            print l.encode("utf-8")

    def test1(self):
        lines = []
        tags = []
        newwords = []
        newwords2 = []
        ss = ""
        pstr = ""
        anstr = ""
        check = False
        wf = open("result_fanhua_1", "ab")
        current = 0
        all = 0
        for line in sys.stdin:
            line = line.split("\t")
            if len(line) < 5:
                print "read wrong:" + "\t".join(line)
                continue
            if ss != line[0] and check:
                (s, p, _seg, _ner) = self._process_data(lines, newwords, newwords2, tags=tags)
                list = self.statistics(s, p, _seg, _ner)
                wf.write(ss + "\t" + pstr + "\t" + anstr + "\n")
                all += 1
                for l in list:
                    print "result" + l
                    wf.write(l + "\n\n")
                    if l.split("\t")[2] == anstr:
                        current += 1
                print current
                lines = []
                tags = []
                newwords = []
                newwords2 = []
            ss = line[0]
            pstr = line[1]
            anstr = line[2]
            tags.append(line[1])
            newwords.append((line[0], (self.relation[line[1].decode("utf-8")])[0]))
            newwords2.append((line[2], (self.relation[line[1].decode("utf-8")])[0]))
            if line[4].strip() != "":
                check = True
                lines.append(line[4].strip())
            else:
                print "read wrong:" + "\t".join(line)
        wf.write("all" + str(all))
        wf.write("current" + str(current))
        wf.close()

    def test(self):
        lines = []
        tags = []
        newwords = []
        for line in sys.stdin:
            try:
                line = line.split(" \t")
                tags.append(line[0].split("\t")[1])
                newwords.append((line[0].split("\t")[0], (self.relation[line[0].split("\t")[1].decode("utf-8")])[0]))
                lines.append(line[1].strip())
            except:
                print line
                quit()
        (s, p, answer) = self._process_data(lines, newwords, None, tags=tags)
        list = self.statistics(s, p, answer)
        for l in list:
            print l.encode("utf-8")
Example #26
            "X: %.3f m" % self._cursor[0])
        textPainter.drawText(
            QPoint(ViewerWindow.MARGIN[0] / self._ratioX,
                   ViewerWindow.MARGIN[1] / self._ratioY + 260),
            "Y: %.3f m" % self._cursor[1])
        textPainter.drawText(
            QPoint(ViewerWindow.MARGIN[0] / self._ratioX,
                   ViewerWindow.MARGIN[1] / self._ratioY + 280),
            "Z: %.2f deg" % (self._cursor[2] / pi * 180))
        textPainter.drawText(
            QPoint(ViewerWindow.MARGIN[0] / self._ratioX,
                   ViewerWindow.MARGIN[1] / self._ratioY + 320), "速度")
        textPainter.drawText(
            QPoint(ViewerWindow.MARGIN[0] / self._ratioX,
                   ViewerWindow.MARGIN[1] / self._ratioY + 340),
            "X-Y: %.3f m/s" % sqrt(self._speed[0]**2 + self._speed[1]**2))
        textPainter.drawText(
            QPoint(ViewerWindow.MARGIN[0] / self._ratioX,
                   ViewerWindow.MARGIN[1] / self._ratioY + 360),
            "Z: %.2f deg/s" % (self._speed[2] / pi * 180))


if __name__ == '__main__':
    from omni import Omni
    from merge import Merge
    from dash import Dash
    omni = Omni()
    merge = Merge()
    dash = Dash(omni, merge)
    viewer = Viewer(dash)
Example #27
def task_download_starts_avtar():
    m = Merge()
    m.download_starts_avtar(query={})
Example #28
    Step 4: clean the Youku data
    '''
    '''
    3. Category info:
    extracted from the cleaned content collection into the category collection (occasional operation)

    '''

    t = time.time()
    # merge()
    # fixyoukuid()
    # fixletv()

    # merge_star()
    # merge_youkustar()
    # merge_letvstar()

    #merge_doubanvideo()
    # merge_letvvideo()
    # merge_youku_videos()

    # groupCategories()   # occasional operation...

    # task_download_starts_avtar()
    # rm_unknown()
    # clean_category()
    # task_groupLanguageAndProducer_country()
    m = Merge()
    m.fix_youku_starId()
    print(time.time() - t)
Example #29
def merge_star():
    m = Merge()
    m.merge_star()
Example #30
#!/usr/bin/python3

from poll import Poll
from merge import Merge
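# Start a polling thread against the devices endpoint and a merge thread against the same local service.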

pollThread = Poll("http://localhost:35673/devices")
pollThread.start()

mergeThread = Merge("http://localhost:35673")
mergeThread.start()
Example #31
    def __init__(self, jieba=False):
        import re

        self.de = re.compile(u"[\u4e00-\u9fa5]")
        self.jieba = jieba
        self.relation = {
            u"fuqin": ("PERSON", "PERSON"),
            u"erzi": ("PERSON", "PERSON"),
            u"nver": ("PERSON", "PERSON"),
            u"nvyou": ("PERSON", "PERSON"),
            u"nanyou": ("PERSON", "PERSON"),
            u"muqin": ("PERSON", "PERSON"),
            u"emma": ("PERSON", "PERSON"),
            u"zhangfu": ("PERSON", "PERSON"),
            u"qizi": ("PERSON", "PERSON"),
            u"\u5973\u53cb": ("PERSON", "PERSON"),
            u"\u5973\u513f": ("PERSON", "PERSON"),
            u"\u59bb\u5b50": ("PERSON", "PERSON"),
            u"\u4e08\u592b": ("PERSON", "PERSON"),
            u"\u524d\u592b": ("PERSON", "PERSON"),
            u"\u7236\u4eb2": ("PERSON", "PERSON"),
            u"\u8eab\u9ad8": ("PERSON", "HEIGHT"),
            u"\u751f\u65e5": ("PERSON", "DATE"),
            u"\u64ad\u51fa\u65f6\u95f4": ("FILM", "TIME"),
            u"\u4e3b\u9898\u66f2": ("FILM", "MUSIC"),
        }
        self.pos_tagger = {
            "a": 0,
            "ad": 1,
            "ag": 2,
            "an": 3,
            "b": 4,
            "bg": 5,
            "c": 6,
            "d": 7,
            "df": 8,
            "dg": 9,
            "e": 10,
            "en": 11,
            "f": 12,
            "g": 13,
            "h": 14,
            "i": 15,
            "in": 16,
            "j": 17,
            "jn": 18,
            "k": 19,
            "l": 20,
            "ln": 21,
            "m": 22,
            "mg": 23,
            "mq": 24,
            "n": 25,
            "ng": 26,
            "nr": 27,
            "nrfg": 28,
            "nrt": 29,
            "ns": 30,
            "nt": 31,
            "nz": 32,
            "o": 33,
            "p": 34,
            "q": 35,
            "qe": 36,
            "qg": 37,
            "r": 38,
            "rg": 39,
            "rr": 40,
            "rz": 41,
            "s": 42,
            "t": 43,
            "tg": 44,
            "u": 45,
            "ud": 46,
            "ug": 47,
            "uj": 48,
            "ul": 49,
            "uv": 50,
            "uz": 51,
            "v": 52,
            "vd": 53,
            "vg": 54,
            "vi": 55,
            "vn": 56,
            "vq": 57,
            "w": 58,
            "x": 59,
            "y": 60,
            "yg": 61,
            "z": 62,
            "zg": 63,
            "a": 64,
            "ad": 65,
            "ag": 66,
            "an": 67,
            "b": 68,
            "bg": 69,
            "c": 70,
            "d": 71,
            "df": 72,
            "dg": 73,
            "e": 74,
            "en": 75,
            "f": 76,
            "g": 77,
            "h": 78,
            "i": 79,
            "in": 80,
            "j": 81,
            "jn": 82,
            "k": 83,
            "l": 84,
            "ln": 85,
            "m": 86,
            "mg": 87,
            "mq": 88,
            "n": 89,
            "ng": 90,
            "nr": 91,
            "nrfg": 92,
            "nrt": 93,
            "ns": 94,
            "nt": 95,
            "nz": 96,
            "o": 97,
            "p": 98,
            "q": 99,
            "qe": 100,
            "qg": 101,
            "r": 102,
            "rg": 103,
            "rr": 104,
            "rz": 105,
            "s": 106,
            "t": 107,
            "tg": 108,
            "u": 109,
            "ud": 110,
            "ug": 111,
            "uj": 112,
            "ul": 113,
            "uv": 114,
            "uz": 115,
            "v": 116,
            "vd": 117,
            "vg": 118,
            "vi": 119,
            "vn": 120,
            "vq": 121,
            "w": 122,
            "x": 123,
            "y": 124,
            "yg": 125,
            "z": 126,
            "zg": 127,
            "a": 128,
            "ad": 129,
            "ag": 130,
            "an": 131,
            "b": 132,
            "bg": 133,
            "c": 134,
            "d": 135,
            "df": 136,
            "dg": 137,
            "e": 138,
            "en": 139,
            "f": 140,
            "g": 141,
            "h": 142,
            "i": 143,
            "in": 144,
            "j": 145,
            "jn": 146,
            "k": 147,
            "l": 148,
            "ln": 149,
            "m": 150,
            "mg": 151,
            "mq": 152,
            "n": 153,
            "ng": 154,
            "nr": 155,
            "nrfg": 156,
            "nrt": 157,
            "ns": 158,
            "nt": 159,
            "nz": 160,
            "o": 161,
            "p": 162,
            "q": 163,
            "qe": 164,
            "qg": 165,
            "r": 166,
            "rg": 167,
            "rr": 168,
            "rz": 169,
            "s": 170,
            "t": 171,
            "tg": 172,
            "u": 173,
            "ud": 174,
            "ug": 175,
            "uj": 176,
            "ul": 177,
            "uv": 178,
            "uz": 179,
            "v": 180,
            "vd": 181,
            "vg": 182,
            "vi": 183,
            "vn": 184,
            "vq": 185,
            "w": 186,
            "x": 187,
            "y": 188,
            "yg": 189,
            "z": 190,
            "zg": 191,
            "a": 192,
            "ad": 193,
            "ag": 194,
            "an": 195,
            "b": 196,
            "bg": 197,
            "c": 198,
            "d": 199,
            "df": 200,
            "dg": 201,
            "e": 202,
            "en": 203,
            "f": 204,
            "g": 205,
            "h": 206,
            "i": 207,
            "in": 208,
            "j": 209,
            "jn": 210,
            "k": 211,
            "l": 212,
            "ln": 213,
            "m": 214,
            "mg": 215,
            "mq": 216,
            "n": 217,
            "ng": 218,
            "nr": 219,
            "nrfg": 220,
            "nrt": 221,
            "ns": 222,
            "nt": 223,
            "nz": 224,
            "o": 225,
            "p": 226,
            "q": 227,
            "qe": 228,
            "qg": 229,
            "r": 230,
            "rg": 231,
            "rr": 232,
            "rz": 233,
            "s": 234,
            "t": 235,
            "tg": 236,
            "u": 237,
            "ud": 238,
            "ug": 239,
            "uj": 240,
            "ul": 241,
            "uv": 242,
            "uz": 243,
            "v": 244,
            "vd": 245,
            "vg": 246,
            "vi": 247,
            "vn": 248,
            "vq": 249,
            "w": 250,
            "x": 251,
            "y": 252,
            "yg": 253,
            "z": 254,
            "zg": 255,
            "eng": 256,
        }
        self.m = Merge(True, False)
        # self.m = Merge(True,True)
        pass
Example #32
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="prepare_for_neuronet")
    parser.add_argument('--input', help='Input directory of bunches (annotation files)')
    args = parser.parse_args()

    input_dir = args.input

    input_dir, output_dir = Utils.init_paths_neuroner(input_dir)
    annotators = ['eugenia', 'victoria', 'isabel', 'carmen']

    variable_dict, variable_hash_dict, section_dict = Entities.get_final_annotators_entities(
                                                                                    input_dir,
                                                                                    output_dir,
                                                                                    t_number=False)
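    # Merge the per-annotator entity, section and hash dictionaries into a single agreed set before writing NeuroNER output.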

    merged_variables, _ = Merge.merge_entities(variable_dict)
    merged_sections, _ = Merge.merge_entities(section_dict)
    merged_variables_hash = Merge.merge_hash(variable_hash_dict)

    section_variable = Merge.merge_variables_sections(merged_variables, merged_sections)

    Write.accepted_variables_neuroner(section_variable, merged_variables_hash, output_dir)

    print("Done")






Example #33
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="re_annotation")
    parser.add_argument('--bunch', help='Which set is going to compare')
    args = parser.parse_args()

    bunch = args.bunch

    input_dir, output_dir = Utils.init_paths()
    annotators = Utils.annators_name(input_dir)

    variable_dict, variable_hash_dict, section_dict= Entities.get_annotators_entities(bunch,
                                                                                    annotators,
                                                                                    input_dir,
                                                                                    t_number=False)

    merged_variables, owner_file = Merge.merge_entities(variable_dict)
    merged_variables = Entities.sorted_entities(merged_variables)

    merged_sections, _ = Merge.merge_entities(section_dict)
    merged_variables_hash = Merge.merge_hash(variable_hash_dict)

    ctakes_dir = input_dir.replace("input", "ctakes_output")
    ctakes_variables, ctakes_variables_hash, ctakes_sections = Entities.get_ctakes_entities(bunch,
                                                                                            ctakes_dir,
                                                                                            t_number=False)

    merged_variables, merged_variables_hash, merged_sections = Merge.merge_ctakes_annotators(merged_variables,
                                                                            merged_variables_hash,
                                                                            merged_sections,
                                                                            ctakes_variables,
                                                                            ctakes_variables_hash,
Example #34
 def act_merge(self):
     merge = Merge(tmpdir=self.processed_dir, section_names=self.sections)
     merge.process(self.filepath_out)
     opendocument(self.filepath_out)