Code Example #1
    def test_64bit_symbolsize(self):
        """
        Asserts that SymbolSizeException is raised for invalid symbol sizes
        on 64-bit systems.

        A valid symbolsize on a 64-bit system is a multiple of 8 bytes.
        """

        #save the original
        original = config._64BIT

        try:
            config._64BIT = True

            # Try a bad symbolsize.
            with self.assertRaises(SymbolSizeException):
                c = Chunker(10, 7)

            try:
                # Try a good symbol size (exactly 8 BYTES)
                c = Chunker(10, 8)

                # Try a multiple of 8BYTES
                c = Chunker(10, 24)
            except Exception as e:
                self.fail(str(e))

        finally:
            # Restore original
            config._64BIT = original
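
The test above (together with the 32-bit expectations in Code Example #13) only pins down the Chunker's validation behaviour: on a 64-bit build a symbolsize must be a multiple of 8 bytes, otherwise SymbolSizeException is raised. As a minimal sketch of a constructor that would satisfy those assertions, one might write something like the following. The class and exception names come from the tests; the 64-bit dtype and all internals are assumptions, not the project's actual implementation.

from types import SimpleNamespace

# Stand-in for the project's config module, purely for this sketch.
config = SimpleNamespace(_64BIT=True)

class SymbolSizeException(Exception):
    pass

class Chunker:
    """Hypothetical sketch; the validation is inferred from the tests, not copied from the real source."""

    def __init__(self, k, symbolsize):
        # 64-bit builds work in 8-byte words, 32-bit builds in 4-byte words
        # (uint32/4 comes from Code Example #13; uint64 is assumed).
        if config._64BIT:
            self.dtype, self.bytes_per_int = "uint64", 8
        else:
            self.dtype, self.bytes_per_int = "uint32", 4

        # A symbol must span a whole number of machine words.
        if symbolsize % self.bytes_per_int != 0:
            raise SymbolSizeException(
                "symbolsize {} is not a multiple of {} bytes".format(
                    symbolsize, self.bytes_per_int))

        self.k = k
        self.symbolsize = symbolsize

The remaining attributes exercised by the other tests are sketched after Code Example #15.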
Code Example #2
File: parse.py Project: Deadlyelder/BlockPerf
    def execute(self):
        self._pool = Pool(config.pool_processors)
        # Create the blank CSV files (with headers) before the pool starts,
        # so the worker processes only append rows to existing files.
        for parser in instance_parsers + node_parsers:
            write.write_header_csv(parser.file_name, parser.csv_header)
        logging.info('Created blank csv')

        self._pool.starmap(_parse, zip(
            repeat(self._writer),
            repeat(config.run_log),
            repeat('test'),
            Chunker.chunkify(config.run_log, config.file_chunk_size),
            repeat(instance_parsers),
        ))

        for node in self._context.nodes.values():
            self._pool.starmap(_parse, zip(
                repeat(self._writer),
                repeat(node.get_log_file()),
                repeat(node.name),
                Chunker.chunkify(node.get_log_file(), config.file_chunk_size),
                repeat(node_parsers),
            ))

        self._pool.close()
        logging.info('Parsing done')
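
The starmap call above leans on itertools: because zip stops at its shortest input, the infinite repeat(...) streams simply pad every (start, size) chunk tuple with the same constant arguments. A tiny self-contained illustration of that expansion, using made-up stand-in values rather than the project's real writer, log file or parsers:

from itertools import repeat

# Hypothetical stand-ins; only the zip/repeat mechanics are being shown.
chunks = [(0, 1024), (1024, 1024), (2048, 512)]
args = list(zip(repeat('writer'), repeat('run.log'), repeat('test'),
                chunks, repeat(['parser_a', 'parser_b'])))

# Each tuple is what Pool.starmap would unpack into one _parse(...) call, e.g.
# ('writer', 'run.log', 'test', (0, 1024), ['parser_a', 'parser_b'])
for a in args:
    print(a)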
Code Example #3
File: parse.py Project: andreaskern/simcoin
    def execute(self):
        self._pool = Pool(config.pool_processors)

        for parser in host_parsers + node_parsers:
            self._writer.write_header_csv(parser.file_name, parser.csv_header)
        logging.info('Created all empty csv files')

        self._pool.starmap(
            _parse,
            zip(
                repeat(self._writer),
                repeat(config.run_log),
                repeat('simcoin'),
                Chunker.chunkify(config.run_log, config.file_chunk_size),
                repeat(host_parsers),
            ))

        for node in self._context.nodes.values():
            self._pool.starmap(
                _parse,
                zip(
                    repeat(self._writer),
                    repeat(node.get_log_file()),
                    repeat(node.name),
                    Chunker.chunkify(node.get_log_file(),
                                     config.file_chunk_size),
                    repeat(node_parsers),
                ))

        self._pool.close()
        logging.info('Finished parsing of run_log and all node logs')
Code Example #4
    def __init__(self, *args, **kwargs):
        self.drives = []
        self.drive_index = 0

        self.got_ack = False
        self.chunker = Chunker(server=True)

        self.ws = None

        super().__init__(*args, **kwargs)
Code Example #5
File: ml100k.py Project: Jelaque/Topics-on-database
    def loadMovieLensParallel(self):
        self.data = {}

        #init objects
        pool = mp.Pool(self.cores)
        jobs = []

        #create jobs
        fname = self.path + "u.data"
        for chunkStart, chunkSize in Chunker.chunkify(fname):
            jobs.append(
                pool.apply_async(self.process_data,
                                 (fname, chunkStart, chunkSize)))

        #wait for all jobs to finish
        for job in jobs:
            job.get()
        #clean up
        pool.close()

        #init objects
        pool = mp.Pool(self.cores)
        jobs = []
        fname = self.path + "u.item"
        for chunkStart, chunkSize in Chunker.chunkify(fname):
            jobs.append(
                pool.apply_async(self.process_item,
                                 (fname, chunkStart, chunkSize)))

        #wait for all jobs to finish
        for job in jobs:
            job.get()
        #clean up
        pool.close()

        #init objects
        pool = mp.Pool(self.cores)
        jobs = []
        fname = self.path + "u.user"
        for chunkStart, chunkSize in Chunker.chunkify(fname):
            jobs.append(
                pool.apply_async(self.process_users,
                                 (fname, chunkStart, chunkSize)))

        #wait for all jobs to finish
        for job in jobs:
            job.get()

        #clean up
        pool.close()
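
In this example and in Code Example #14, Chunker.chunkify(fname) yields (chunkStart, chunkSize) byte offsets so that each pool worker can seek to its own slice of the file and process it independently. A common way to implement that is to cut the file into roughly equal pieces and extend each cut to the next newline so no line is split across workers. The function below is a sketch of that idea under those assumptions, not the project's actual chunkify:

import os

# Hypothetical sketch of a byte-offset chunkifier that never splits a line
# across two chunks; the real Chunker.chunkify may differ.
def chunkify(fname, chunk_size=1024 * 1024):
    file_end = os.path.getsize(fname)
    with open(fname, 'rb') as f:
        chunk_start = 0
        while chunk_start < file_end:
            # Jump roughly chunk_size ahead, then extend to the end of the
            # current line so every yielded slice contains whole lines only.
            f.seek(chunk_start + chunk_size)
            f.readline()
            chunk_end = min(f.tell(), file_end)
            yield chunk_start, chunk_end - chunk_start
            chunk_start = chunk_end

A worker given (chunkStart, chunkSize) would then open the same file, seek(chunkStart) and read(chunkSize) to obtain its lines.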
Code Example #6
    def test_block_ids(self):
        """
        Tests the generation of block ids
        The first should be 0
        The second should be 0 + 1
        """
        c = Chunker(DEFAULT_K, DEFAULT_SYMBOLSIZE)
        expected_block_id = 0

        # Check that first blockid = 0
        self.assertEqual(c.get_block_id(), expected_block_id)

        # Check that the next block id = 1
        expected_block_id = 1
        self.assertEqual(c.get_block_id(), expected_block_id)
Code Example #7
File: parse.py Project: Deadlyelder/BlockPerf
def _parse(writer, log_file, name, chunk, parsers):
    parsed_objects = {}
    for line in Chunker.parse(Chunker.read(log_file, chunk)):
        for parser in parsers:
            try:
                parsed_object = parser.from_log_line(line, name)
                parsed_objects.setdefault(parsed_object.file_name, []).append(parsed_object)
                break
            except ParseException:
                pass

    for key in parsed_objects:
        writer.append_csv(key, parsed_objects[key])
    logging.info('Parsed {} distinct object types from chunk'
                 .format(len(parsed_objects)))
Code Example #8
    def test_block_ids(self):
        """
        Tests the generation of block ids
        The first should be 0
        The second should be 0 + 1
        """
        c = Chunker(DEFAULT_K, DEFAULT_SYMBOLSIZE)
        expected_block_id = 0

        # Check that first blockid = 0
        self.assertEqual(c.get_block_id(), expected_block_id)

        # Check that the next block id = 1
        expected_block_id = 1
        self.assertEqual(c.get_block_id(), expected_block_id)
Code Example #9
    def scores(self, viterbi_sequences, id_to_tag_dict, validation_labels,
               batch):
        # Get performance of model: precision, recall, f1-score
        found_viterbi = 0
        correct_viterbi = 0
        found_y = 0  # i.e. correct_y
        for seq in range(len(viterbi_sequences)):
            viterbi_tag_idxs = viterbi_sequences[seq]
            viterbi_tags = [
                id_to_tag_dict[i] for i in np.asarray(viterbi_tag_idxs)
            ]
            chunker = Chunker()
            viterbi_entities = chunker.extract_named_entities(viterbi_tags)

            y_tag_idxs = validation_labels[batch][seq][:len(viterbi_tag_idxs)]
            y_tags = [id_to_tag_dict[j] for j in y_tag_idxs]
            y_entities = chunker.extract_named_entities(y_tags)

            found_viterbi += len(viterbi_entities)
            found_y += len(y_entities)
            for ner in viterbi_entities:
                if ner in y_entities:
                    correct_viterbi += 1

        if found_viterbi > 0:
            print("Found (correct) " + str(correct_viterbi))
            print("Found (guessed) " + str(found_viterbi))
            self.precision = 100.00 * (correct_viterbi / found_viterbi)
            print("Precision: " + str(self.precision))
        else:
            print("Precision 0. No named entities guessed")

        if found_y > 0:
            print("Found (correct) " + str(correct_viterbi))
            print("Actual correct " + str(found_y))
            self.recall = 100.00 * (correct_viterbi / found_y)
            print("Recall: " + str(self.recall))
        else:
            print("Recall 0. No named entities in gold standard.")

        if (self.precision > 0) and (self.recall > 0):
            self.f1_score = 2.0 * self.precision * self.recall / (
                self.precision + self.recall)
            print("F1 score: " + str(self.f1_score))
        else:
            print("F1 score 0. Precision and recall are 0.")
Code Example #10
File: test_chunker.py Project: longtoa/Locations
    def test_small_chunk(self):
        new_lijst = [i for i in range(20)]
        new_lijst_chunker = Chunker(new_lijst)

        new_lijst50 = list(new_lijst_chunker(50))

        self.assertEqual(len(new_lijst50), 1)
        self.assertEqual(new_lijst50[0][19], 19)
Code Example #11
File: test_chunker.py Project: longtoa/Locations
    def test_even_chunk(self):
        lijst = [i for i in range(300)]
        lijst_chunker = Chunker(lijst)

        lijst50 = list(lijst_chunker(50))

        self.assertEqual(len(lijst50[0]), 50)
        self.assertEqual(lijst50[1][0], 50)
        self.assertEqual(len(lijst50), 6)
        self.assertEqual(lijst50[5][49], 299)
Code Example #12
File: test_chunker.py Project: longtoa/Locations
    def test_odd_chunk(self):
        lijst = [i for i in range(253)]
        lijst_chunker = Chunker(lijst)

        lijst47 = list(lijst_chunker(47))

        self.assertEqual(len(lijst47[0]), 47)
        self.assertEqual(lijst47[1][0], 47)
        self.assertEqual(len(lijst47), 6)
        self.assertEqual(lijst47[5][17], 252)
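
Taken together, the three tests above (plus the TypeError test in Code Example #19, from the same project) describe a Chunker that wraps a list and, when called with a size, yields successive slices of that length, with a shorter final slice. A minimal sketch that would satisfy exactly those assertions, assuming nothing beyond the tested behaviour:

# Minimal sketch inferred from the tests; the project's real class may differ.
class Chunker:
    def __init__(self, data):
        if not isinstance(data, list):
            raise TypeError("Chunker expects a list, got {}".format(type(data)))
        self.data = data

    def __call__(self, size):
        # Yield consecutive slices of `size` items; the last one may be shorter.
        for start in range(0, len(self.data), size):
            yield self.data[start:start + size]

With 253 items and a chunk size of 47 this produces six chunks, the last holding the 18 leftover items, which is exactly what test_odd_chunk asserts.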
Code Example #13
    def test_32bit_init(self):
        """
        Tests the initialization of the chunker on 32 bit systems
        """
        original = config._64BIT
        config._64BIT = False
        try:
            c = Chunker(DEFAULT_K, DEFAULT_SYMBOLSIZE)
            self.assertEqual(c.dtype, "uint32")
            self.assertEqual(c.bytes_per_int, 4)
        finally:
            config._64BIT = original
Code Example #14
    def readMovies(self, path='/ml-25m'):
        self.data = {}

        #init objects
        pool = mp.Pool(8)
        jobs = []

        #create jobs
        fname = self.path + path + "/ratings.csv"
        for chunkStart, chunkSize in Chunker.chunkify(fname):
            jobs.append(
                pool.apply_async(self.process_data,
                                 (fname, chunkStart, chunkSize)))

        for job in jobs:
            job.get()
        #clean up
        pool.close()
Code Example #15
    def test_chunker_init(self):
        """
        Tests the basic initialization of the chunker class
        """
        expected_blocksize = DEFAULT_SYMBOLSIZE * DEFAULT_K
        expected_block_id = 0
        c = Chunker(DEFAULT_K, DEFAULT_SYMBOLSIZE)

        # Check k
        self.assertEqual(c.k, DEFAULT_K)

        # Check Symbolsize
        self.assertEqual(c.symbolsize, DEFAULT_SYMBOLSIZE)

        # Check blocksize
        self.assertEqual(c.blocksize, expected_blocksize)

        # Check expected block id
        self.assertEqual(c.block_id, expected_block_id)
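
Code Examples #6, #8 and #15 pin down the remaining constructor attributes and the block-id counter: blocksize is k times the symbolsize, and get_block_id hands out 0, 1, 2, ... in order. Continuing the hypothetical sketch given after Code Example #1 (still an assumption, not the real source; symbol-size validation and dtype selection are omitted here):

# Continuation of the hypothetical Chunker sketch from Code Example #1.
class Chunker:
    def __init__(self, k, symbolsize):
        self.k = k
        self.symbolsize = symbolsize
        self.blocksize = k * symbolsize   # bytes per block
        self.block_id = 0                 # the first block gets id 0

    def get_block_id(self):
        # Return the current id, then advance the counter (0, 1, 2, ...).
        block_id = self.block_id
        self.block_id += 1
        return block_id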
Code Example #16
def simulator_evaluate_all(file, evaluation_list=None):
    if evaluation_list is None:
        evaluation_list = []
    chunker = Chunker(file)
    for each_chunk in chunker:
        simulator_jevaluate(each_chunk, evaluation_list)
Code Example #17
from chunker import Chunker

ch = Chunker()
kalimat = "Richard Winger , rekan di Boston Consulting Group , menambahkan : Belakangan ini , sangat populer jika menghias diri anda dengan bendera ."
print(ch.tree_to_str(ch.chunk_me1(kalimat)))
print(ch.tree_to_str(ch.chunk_me2(kalimat)))
print(ch.tree_to_str(ch.chunk_me3(kalimat)))
Code Example #18
def parse():
    "parse POSTed Korean sentence"
    # grab sentence to parse
    sentence = request.form.get('sentence')
    if not sentence:
        return jsonify(result="FAIL", msg="Missing sentence")

    # build a string for the KHaiii phoneme analyzer
    if sentence.strip()[-1] not in ['.', '?', '!']:
        sentence += '.'

    # run Khaiii
    words = []
    for w in khaiiiAPI.analyze(sentence):
        for m in w.morphs:
            words.append('{0}:{1}'.format(m.lex.strip(), m.tag))
    posString = ';'.join(words)

    # map POS through synthetic tag mapper
    mappedPosList = TagMap.mapTags(posString)

    # perform chunk parsing
    chunkTree = Chunker.parse(mappedPosList)

    # apply any synthetic-tag-related node renamings
    TagMap.mapNodeNames(chunkTree)

    # build descriptive phrase list
    phrases = Chunker.phraseList(chunkTree)

    # recursively turn the chunk tree into a Python nested dict for the JSON response
    def asDict(chunk):
        while isinstance(chunk, nltk.Tree) and len(chunk) == 1:
            # flatten degenerate tree nodes
            chunk = chunk[0]
        if isinstance(chunk, nltk.Tree):
            return dict(
                type='tree',
                tag='Sentence' if chunk.label() == 'S' else chunk.label(),
                children=[asDict(t) for t in chunk])
        else:
            return dict(type='pos', word=chunk[0].strip(), tag=chunk[1])

    #
    parseTree = asDict(chunkTree)
    debugging = dict(posList=pformat(words),
                     mappedPosList=pformat(mappedPosList),
                     phrases=pformat(phrases),
                     parseTree=pformat(parseTree))

    return jsonify(result="OK",
                   posList=words,
                   mappedPosList=mappedPosList,
                   phrases=phrases,
                   parseTree=parseTree,
                   debugging=debugging)

    if False:

        # synthetic tag patterns -
        #    patterns of these word:POC strings are preprocessed to define new
        #    synthetic word:POC tags used in the chunking grammar below
        #  at present, these are applied in order of longest-to-shortest pattern; we should probably make this a list for explicit ordering

        tagMappings = {
            r'들:(TM|XSN)': r'들:PLU',  # pluralizer
            r'기:(ETN|NNG)': r'기:GNOM',  # nominalizer
            r'(ㄴ|는|ㄹ):ETM;것:NNB': r'\1 것:GNOM',  # nominalizer
            r'(은|는):JX':
            r'\1:JKS',  # turn topic-marking particle into subject-marker (I think this is right??)
            r'(ㄹ|을|를):ETM;거:NNB;이:VCP':
            r'\1 거 이:FUT',  # ㄹ/를 거 이다 future-tense conjugator (hack!)
            r'전:NNG;에:JKB': r'전에:BEF',  # before
            r'때문:NNB;에:JKB': r'때문에:BEC',  # because
            r'및:MAG': r'및:ALS',  # also connector (why is it an adverb??)
            r'또는:MAG':
            r'또는:ALT',  # alternative connector (why is it an adverb??)
            r'에:JKB;(대하|관하):VV;([^:]+):EC':
            r'에 \1\2:PRP',  # preposition "about"
        }

        # prepositional phrase suffix-patterns (generate a <PRPP> pos-tag + metadata to label the parsing)

        #    tag-pattern                  replacement          subtree name-mapping             reference links
        # (r'전:NNG;에:JKB',              r'전에:PRP',      "PrepositionalPhrase:Before",  ("ttmik:lessons/level-3-lesson-10", "htsk:unit1/unit-1-lessons-17-25-2/lesson-24/#242"))     # before

        # generate a version of the parser's original word:POC string including synthetic tag mappings above
        tagString = ';' + posString + ';'
        for old, new in sorted(tagMappings.items(),
                               key=lambda x: len(x[0]),
                               reverse=True):
            tagString = re.sub(';' + old + ';', ';' + new + ';', tagString)
        mappedWords = [
            tuple(pos.split(':')) for pos in tagString.strip(';').split(';')
        ]

        # Korean phrase NLTK chunking grammar

        grammar = r"""
    
            HadaVerb:           {<NN.*><XSV>}
            AuxiliaryVerb:      {<EC><VX|VV>}
            Adverb:             {<MAG>}
            NominalizedVerb:    {<VV|HadaVerb><EP|FUT>*<GNOM>}
            Adjective:          {<Adverb>*<VA><ETM>}
            DescriptiveVerb:    {<VA>}
            Verb:               {<VV|VCN|HadaVerb|DescriptiveVerb>}
            VerbSuffix:         {<EP|FUT>*<EF|EC>}
    
            Location:           {<JKB>}
            Title:              {<XSN>}
        
            Preposition:        {<PRP>}
            
            Noun:               {<NN.*|NR|SL>}       
            Pronoun:            {<NP>}
            Substantive:        {<Noun><Noun>*}
                                {<Pronoun>}
                                {<NominalizedVerb>}            
            NounPhrase:         {<XPN>*<MAG>*<Adjective>*<Substantive><Title>*<Location>*<PLU>*<JX>*<Preposition>*}
            
            Possessive:         {<NounPhrase><JKG><NounPhrase>}
            Component:          {<NounPhrase|Possessive><JC|ALS|ALT>}
            Connection:         {<Component><Component>*<NounPhrase|Possessive>}
            
            Constituent:        {<NounPhrase|Possessive|Connection>}
        
            Complement:         {<Constituent><JKC>} 
            Object:             {<Constituent><JKO>}  
            Subject:            {<Constituent><JKS>}
            
            Before:             {<Constituent|Object|Subject>*<Constituent><BEF>}
            Because:            {<Constituent|Object|Subject>*<Constituent><BEC>}
            
            Copula:             {<Constituent><Adverb>*<VCP><AuxiliaryVerb>*<VerbSuffix>}
            Predicate:          {<Adverb>*<Verb><AuxiliaryVerb>*<VerbSuffix>}
    
            """

        # Component: { < NounPhrase > < JC | ALS > }
        # Connection: { < Component > < Component > * < NounPhrase > }
        # Possessive: { < NounPhrase | Connection > < JKG > < NounPhrase | Connection > }

        # Constituent:        {<Subject|Object|Complement>}
        # Clause:             {<Constituent>*<Predicate>}
        # Sentence:           {<Clause><Clause>*}

        # gen chunk tree from the word-POS list under the above chunking grammar
        parser = nltk.RegexpParser(grammar, trace=1)
        print(parser._stages)
        chunkTree = parser.parse(mappedWords)

        # heuristic subtree simplifications
        # toss sentence end node
        if not isinstance(chunkTree[-1],
                          nltk.Tree) and chunkTree[-1][1] == 'SF':
            chunkTree.remove(chunkTree[-1])
        # flatten connection trees
        def flattenConnections(t):
            for st in t:
                if isinstance(st, nltk.Tree):
                    if st.label() == 'Connection':
                        # if Connection node, pull up component tuples into one long connection sequence
                        for i, c in enumerate(list(st)[:-1]):
                            st[2 * i] = c[0]
                            st.insert(2 * i + 1, c[1])
                    else:
                        flattenConnections(st)

        flattenConnections(chunkTree)

        # generate phrase-descriptors from top-level subtrees
        hiddenTags = {
            'Substantive',
            'Constituent',
            'NounPhrase',
            'Connection',
        }

        def flattenPhrase(t, phrase):
            for st in t:
                if isinstance(st, nltk.Tree):
                    phrase = flattenPhrase(st, phrase)
                    if st.label() not in hiddenTags:
                        phrase.append({"type": 'label', "word": st.label()})
                else:
                    phrase.append({
                        "type": 'word',
                        "word": st[0].strip(),
                        "tag": st[1]
                    })  # st[1][0] if st[1][0] in ('N', 'V') else st[0].strip()
            return phrase

        #
        phrases = []
        for t in chunkTree:
            if isinstance(t, nltk.Tree):
                phrase = flattenPhrase(t, [])
                if t.label() not in hiddenTags:
                    phrase.append({"type": 'label', "word": t.label()})
                phrases.append(phrase)
            else:
                phrases.append(('word', t[0].strip()))
        for p in phrases:
            print(p)

        # recursively turn the chunk tree into a Python nested dict for the JSON response
        def asDict(chunk):
            while isinstance(chunk, nltk.Tree) and len(chunk) == 1:
                # flatten degenerate tree nodes
                chunk = chunk[0]
            if isinstance(chunk, nltk.Tree):
                return dict(
                    type='tree',
                    tag='Sentence' if chunk.label() == 'S' else chunk.label(),
                    children=[asDict(t) for t in chunk])
            else:
                return dict(type='pos', word=chunk[0].strip(), tag=chunk[1])

        #
        parseTree = asDict(chunkTree)
        debugging = dict(posList=pformat(words),
                         mappedPosList=pformat(mappedWords),
                         phrases=pformat(phrases),
                         parseTree=pformat(parseTree))

        return jsonify(result="OK",
                       posList=words,
                       mappedPosList=mappedWords,
                       phrases=phrases,
                       parseTree=parseTree,
                       debugging=debugging)
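
The disabled if False: block above is essentially an inline copy of the tag-mapping and chunking-grammar pipeline that the live code delegates to TagMap and Chunker. The core NLTK mechanism it relies on is small: nltk.RegexpParser compiles a grammar of tag patterns and applies it to a list of (word, tag) pairs, returning an nltk.Tree. A toy, self-contained illustration with English tags (deliberately unrelated to the Korean grammar above):

import nltk

# A deliberately tiny grammar: group determiner/adjective/noun runs into NPs.
grammar = r"""
    NP: {<DT>?<JJ>*<NN.*>+}
"""
parser = nltk.RegexpParser(grammar)

tagged = [("the", "DT"), ("quick", "JJ"), ("fox", "NN"),
          ("jumps", "VBZ"), ("over", "IN"),
          ("the", "DT"), ("lazy", "JJ"), ("dog", "NN")]

tree = parser.parse(tagged)   # an nltk.Tree with two NP subtrees
tree.pprint()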
Code Example #19
File: test_chunker.py Project: longtoa/Locations
    def test_not_list_passed(self):

        with self.assertRaises(TypeError):
            dict = {'test': 15, 'test2': 16}
            Chunker(dict)
Code Example #20
def test_chunker(reference_chunks):
    with open('bootstrap.utf.txt', encoding='utf-8') as bootstrap_file:
        chunker = Chunker(bootstrap_file)
        for index, each_chunk in enumerate(chunker):
            reference_chunk = reference_chunks[index]
            assert each_chunk == reference_chunk
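
Code Examples #16 and #20 exercise yet another Chunker variant: one constructed from an open text file and iterated chunk by chunk, with the expected chunks kept in a reference fixture. The tests do not reveal how the chunk boundaries are chosen; the sketch below shows one plausible shape of such an iterator (fixed-size character chunks) and is an assumption, not the tested implementation:

# Hypothetical file-backed, iterable chunker; the boundary rule used here
# (fixed-size character chunks) is an assumption.
class Chunker:
    def __init__(self, file_obj, chunk_size=4096):
        self.file_obj = file_obj
        self.chunk_size = chunk_size

    def __iter__(self):
        while True:
            chunk = self.file_obj.read(self.chunk_size)
            if not chunk:
                return
            yield chunk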
Code Example #21
File: client.py Project: wcpun/DFS
def put(user_id, fileName):
    # read data from input file
    with open(fileName, 'rb') as fin:
        bytes = fin.read(BYTE_SIZE)
        fileSize = len(bytes)
        logging.debug("Bytes read with length {0}.".format(fileSize))

    # begin chunking
    chunkObj = Chunker()
    cuts = chunkObj.chunking(bytes)
    logging.debug("Num of Cuts: {0}".format(len(cuts)))
    # for index, chunk in enumerate(chunkObj.divide_chunk(bytes, cuts)):
    #     logging.debug("Chunk {0}: {1}".format(index, chunk))
    logging.info("Chunking completed.\n")

    # create meta data
    fid = fileName + str(user_id)
    cryptObj = Crypto(fid, 'client')
    iv = cryptObj.gen_iv(bytes)
    metaObj = Meta(user_id, fileName, fileSize, iv)
    logging.debug("FileName: {0}".format(metaObj.fileName))
    logging.debug("IV: {0}\n".format(iv))

    # gen keys
    try:
        keys = []
        with con("localhost", 18862) as keyserver:
            if keyserver.root.isExist(fid):
                logging.error("File Already Exist.")
                exit(1)
            for i, chunk in enumerate(chunkObj.divide_chunk(bytes, cuts)):
                key = keyserver.root.gen_key(fid, chunk)
                if not key:
                    logging.error("Key is empty.")
                    exit(1)
                keys.append(key)
                logging.debug("Key {0}: {1}".format(i, key))
    except:
        logging.error("Cannot generate key.")
        exit(1)
    logging.info("Key generation completed.\n")

    # encrypt chunks
    ciphers = []
    count = 0
    for chunk, key in zip(chunkObj.divide_chunk(bytes, cuts), keys):
        cipher = cryptObj.encode(chunk, key, iv)
        ciphers.append(cipher)
        # logging.debug("Cipher {0}: {1}".format(count, cipher))
        count += 1
    del bytes
    del keys
    logging.info("Encryption completed.\n")

    # send ciphers to server
    try:
        with con("localhost", 18861) as master:
            if master.root.put(metaObj):
                for cipher in ciphers:
                    master.root.put(metaObj, cipher)
                    logging.info("Chunk with size {0} uploaded.".format(
                        len(cipher)))
            else:
                logging.error("File Already Exist.")
                exit(1)
    except:
        logging.error("Cannot upload file.")
        exit(1)
Code Example #22
import os
from flask import Flask, request, json, render_template, Response
from flask_cors import CORS
from chunker import Chunker

ch = Chunker()

app = Flask(__name__)
CORS(app)

@app.route('/')
def hello_world():
    return 'Hello World'

@app.route('/chunk', methods=['POST'])
def chunk():
    data_all = request.get_json()

    kalimat = data_all['kalimat']
    return str(ch.tree_to_str(ch.chunk_me2(kalimat)))

    # _ists = iSTSEngine()
    # return _ists.singleTest()

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)
Code Example #23
    parser.add_argument(
        '-n',
        '--n-workers',
        type=int,
        default=DEFAULT_WORKERS,
        help=f"number of workers to use (default depends on your system)")
    parser.add_argument(
        '--spacy-langs',
        nargs='+',
        default=['de', 'en'],
        help='spaCy model language names (in order) to use for lemmatization')
    parser.add_argument(
        '--outdir',
        default=None,
        help='directory to put outputs (if different original file locations)')
    parser.add_argument(
        '--logdir',
        default=None,
        help='directory to put logs (default will be current working dir)')
    args = parser.parse_args()

    args = vars(args)
    fps = args.pop('fps')
    workers = args.pop('n_workers')
    b_size = args.pop('batch_size')
    max_tasks = args.pop('max_tasks_per_child')
    outdir = args.pop('outdir')

    representer = ParallelSpacy(**args)
    representer.chunker = Chunker(fps, b_size)
    representer.process(fps, workers, max_tasks, outdir=outdir)
Code Example #24
def parseInput(input,
               parser="RD",
               showAllLevels=False,
               getWordDefinitions=True):
    "parse input string into list of parsed contained sentence structures"
    # parser can be RD for recursive descent (currently the most-developed) or "NLTK" for the original NLTK chunking-grammar parser

    # clean & build a string for the KHaiii phoneme analyzer
    input = input.strip()
    if input[-1] not in ['.', '?', '!']:
        input += '.'
    input = re.sub(
        r'\s+([\.\?\;\,\:])', r'\1',
        input)  # elide spaces preceding clause endings, throws Khaiii off
    # input = input.replace(',', ' , ').replace(';', ' ; ').replace(':', ' : ') - adding a space before punctuation seems to mess tagging in Khaiii
    log("* parse {0}".format(input))

    # run Khaiii, grab the parts-of-speech list it generates (morphemes + POS tags) and extract original word-to-morpheme groupings
    sentences = []  # handle possible multiple sentences
    posList = []
    morphemeGroups = []
    for w in khaiiiAPI.analyze(input):
        morphemeGroups.append(
            [w.lex, [m.lex for m in w.morphs if m.tag != 'SF']])
        for m in w.morphs:
            posList.append('{0}:{1}'.format(m.lex.strip(), m.tag))
            if m.tag == 'SF':
                # sentence end, store extractions & reset for possible next sentence
                sentences.append(
                    dict(posList=posList,
                         morphemeGroups=morphemeGroups,
                         posString=';'.join(posList)))
                posList = []
                morphemeGroups = []

    for s in sentences:
        # map POS through synthetic tag mapper & extract word groupings
        mappedPosList, morphemeGroups = TagMap.mapTags(
            s['posString'], s['morphemeGroups'])  #, disableMapping=True)
        log("  {0}".format(s['posString']))
        log("  mapped to {0}".format(mappedPosList))

        if parser == "NLTK":  # NLTK chunking parser
            # perform chunk parsing
            chunkTree = Chunker.parse(mappedPosList, trace=2)
            chunkTree.pprint()
            # apply any synthetic-tag-related node renamings
            TagMap.mapNodeNames(chunkTree)
            # extract popup wiki definitions & references links & notes for implicated nodes
            references = TagMap.getReferences(chunkTree)
            # build descriptive phrase list
            phrases = Chunker.phraseList(chunkTree)
            #
            parseTreeDict = buildParseTree(chunkTree,
                                           showAllLevels=showAllLevels)

        else:  # recursive-descent parser
            from rd_grammar import KoreanParser
            parser = KoreanParser([":".join(p) for p in mappedPosList])
            parseTree = parser.parse(verbose=1)
            if parseTree:
                # apply any synthetic-tag-related node renamings
                parseTree.mapNodeNames()
                # extract popup wiki definitions & references links & notes for implicated nodes
                references = parseTree.getReferences()
                # build descriptive phrase list
                phrases = parseTree.phraseList()
                # get noun & verb translations from Naver
                wordDefs = getWordDefs(
                    mappedPosList) if getWordDefinitions else {}
                # build JSONable parse-tree dict
                parseTreeDict = parseTree.buildParseTree(
                    wordDefs=wordDefs, showAllLevels=showAllLevels)
                log("  {0}".format(parseTree))
            else:
                # parsing failed, return unrecognized token
                parseTree = references = parseTreeDict = phrases = None
                s.update(
                    dict(error="Sorry, failed to parse sentence",
                         lastToken=parser.lastTriedToken()))
                log("  ** failed.  Unexpected token {0}".format(
                    parser.lastTriedToken()))

        # format debugging data
        debugging = dict(posList=pformat(s['posList']),
                         mappedPosList=pformat(mappedPosList),
                         phrases=pformat(phrases),
                         morphemeGroups=pformat(morphemeGroups),
                         parseTree=pformat(parseTreeDict),
                         references=references)

        # add parsing results to response structure
        s.update(
            dict(mappedPosList=mappedPosList,
                 morphemeGroups=morphemeGroups,
                 parseTree=parseTreeDict,
                 references=references,
                 phrases=phrases,
                 debugging=debugging))
    #
    return sentences
Code Example #25
class WSView(RedisView):
    def __init__(self, *args, **kwargs):
        self.drives = []
        self.drive_index = 0

        self.got_ack = False
        self.chunker = Chunker(server=True)

        self.ws = None

        super().__init__(*args, **kwargs)

    @property
    def latest_drive(self):
        if not self.drives:
            return None

        drive_id = self.drives[self.drive_index]
        self.drive_index += 1
        if self.drive_index >= len(self.drives):
            self.drive_index = 0

        return drive_id

    async def send(self, data):
        await self.ws.send_str(json.dumps(data))

    async def get(self):
        ws = aiohttp.web.WebSocketResponse(heartbeat=2)
        print('Opening a socket.')

        await ws.prepare(self.request)
        print('Websocket connection ready')

        self.got_ack = True
        self.ws = ws
        await self.create_redis()

        async for message in self.ws:
            if message.type != aiohttp.WSMsgType.TEXT:
                continue

            print(
                f"{message.data}; chunk: {self.chunker.current_chunk}; drive: {self.drive_index}; ack: {self.got_ack}; {len(self.drives)} drives"
            )
            await self.handle_message(message)

        print('Websocket connection closed')

        await self.cleanup_redis()

        return self.ws

    async def handle_message(self, message):
        data = json.loads(message.data)

        if data["cmd"] == 'close':
            await self.ws.close()
        elif data["cmd"] == "ack":
            self.got_ack = True
        elif data["cmd"] == "setchunk":
            self.chunker.set_chunk(data["i"])
        elif data["cmd"] == "ping":
            if not self.drives:
                await self.send({"cmd": "getinfo"})
                return

            if not self.got_ack:
                return

            self.got_ack = False

            await self.send({
                "cmd": "getchunk",
                "chunk_id": self.chunker.current_chunk,
                "address": self.latest_drive
            })
            self.chunker.next_chunk()
        elif data["cmd"] == "drives":
            self.drives = data["drives"]