Code Example #1
File: parseWikidata.py  Project: reconrus/IP1e
def process_chunk(chunk, input_file, output_folder, jobID, encoding):
    # Parse every line contained in this byte chunk of the input file.
    processed_lines = []
    for line in Chunker.parse(Chunker.read(input_file, chunk)):
        processed_line = process_line(line, output_folder)
        processed_lines.append(processed_line)

    # Append the results to a per-worker file named after the process PID,
    # so parallel workers never write to the same file.
    file_path = os.path.join(
        output_folder, 'processed_wikidata_%d' % mp.current_process().pid)

    with open(file_path, 'a+', encoding=encoding) as f:
        f.writelines(processed_lines)

    print("Processed chunk #%d" % (jobID + 1))
Code Example #2
File: run.py  Project: danjamker/N-Fly
    def __init__(self, llwl='Brown', llNL=2, percen=80, NE=True, Col=True, Gram=True, Chu=True):
        '''
        @param llwl: LogLikelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: LogLikelihood N-gram length (NLength)
        @param percen: percentage of results returned (default 80)
        @param NE: use named-entity extraction (default True)
        @param Col: use collocations (default True)
        @param Gram: use N-grams (default True)
        @param Chu: use chunking (default True)
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'
Code Example #3
# Convert the grammar trees in the corpus into a CFG (Context-Free Grammar).
grammar = funcs.InduceNonTerminal(grammarTrain)

# Save the grammar file.
pickle.dump(grammar, open("grammar.txt", "wb"))
print("Grammar induction finished.")
'''========= Part IV: Chunking ========'''
# In this part, we chunk sentences into different phrases using the IOB (Inside-Outside-Beginning) tags. There are three
# kinds of phrases: noun phrases (NP), verb phrases (VP) and prepositional phrases (PP).

# Load the train and test dataset for chunking.
chunkTrain = nltk.corpus.conll2000.chunked_sents("train.txt")
chunkTest = nltk.corpus.conll2000.chunked_sents("test.txt")

# Instantiate a Chunker object and train it on the training corpus.
chunker = Chunker(chunkTrain)

# Evaluate the chunker's performance on the test corpus.
print(chunker.evaluate(chunkTest))

# Use the trained chunker to chunk our own texts.
chunkedSents = funcs.ChunkSents(tokens, chunker)

# Save the chunked texts.
pickle.dump(chunkedSents, open("chunked_sents.txt", "wb"))
print("Chunking finished.")
'''======== Part V: Deep parsing ========'''
# In this part, we use the grammar induced in the previous step to parse our texts. Basically, we use a shift-reduce
# parsing algorithm to parse the texts and find out whether there are larger phrases built on top of smaller phrases.

# Instantiate a parser object and load it with the grammar.
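The example is cut off at this point. Purely as an illustration, and not taken from the original file, the parsing step might continue along these lines with NLTK's shift-reduce parser, assuming grammar is the CFG induced above and tokens is the list of tokenized sentences used in the earlier steps:

import pickle

import nltk

# Assumed continuation: build a shift-reduce parser from the induced grammar
# and try to parse each tokenized sentence with it.
parser = nltk.ShiftReduceParser(grammar)

parsedSents = []
for sent in tokens:
    # parse() returns an iterator of parse trees (possibly empty if no parse is found).
    parsedSents.append(list(parser.parse(sent)))

# Save the parse trees, mirroring the pickle pattern used above.
pickle.dump(parsedSents, open("parsed_sents.txt", "wb"))
print("Deep parsing finished.")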
Code Example #4
File: run.py  Project: danjamker/N-Fly
class runable(object):
    '''
    Class for selecting and extracting keywords from online content.
    '''
    def __init__(self,
                 llwl='Brown',
                 llNL=2,
                 percen=80,
                 NE=True,
                 Col=True,
                 Gram=True,
                 Chu=True):
        '''
        @param llwl: LogLikelihood corpus name ('Brown', 'AmE06', 'BE06')
        @param llNL: LogLikelihood N-gram length (NLength)
        @param percen: percentage of results returned (default 80)
        @param NE: use named-entity extraction (default True)
        @param Col: use collocations (default True)
        @param Gram: use N-grams (default True)
        @param Chu: use chunking (default True)
        '''

        self.NEs = NE
        self.Col = Col
        self.Gram = Gram
        self.Chu = Chu
        self.p = percen
        print 'Starting to build ', llwl
        self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
        print 'LL Loaded'
        self.POS = POS()
        print 'POS Loaded'
        self.GD = GetData()
        print 'GD Loaded'
        self.Cu = Chunker(self.POS)
        print 'Cu Loaded'
        self.FL = Filter()
        print 'FL Loaded'
        self.CC = Collocation(self.POS)
        print 'CC Loaded'
        self.Ng = NGram()
        print 'Ng Loaded'
        self.S = Select(percentil=self.p)
        print 'S Loaded'
        self.To = Tokenize(self.FL)
        print 'To Loaded'

    def Select(self, url, depth):
        '''
        Determine the best keywords for a webpage.

        @param url: the base URL to start sampling from
        @param depth: the depth of the website to be sampled

        @return: the list of selected keywords, ordered with the highest-rated words at the start (lower bound) of the array.
        '''
        #Get data from web page
        text = self.GD.getWebPage(url, depth)

        #Tokenize sentences and words
        tok = self.To.Tok(text)

        #POS tag the text
        pos = self.POS.POSTag(tok, 'tok')

        #Log-likelihood
        log = self.LL.calcualte(tok)

        #Collocations
        if self.Col == True:
            col = self.CC.col(pos, tok)
        else:
            col = []

        #NE Extraction
        if self.NEs == True:
            ne = self.Cu.Chunks(pos,
                                nodes=['PERSON', 'ORGANIZATION', 'LOCATION'])
        else:
            ne = []

        #Extract NP
        if self.Chu == True:
            chu = [self.Cu.parse(p) for p in pos]
        else:
            chu = []

        #Create N-grams
        if self.Gram == True:
            ga = self.Ng.Grams(pos, n=6)
        else:
            ga = []

        return self.S.keywords(ne, ga, col, chu, log)
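For context, a hypothetical usage of this class might look like the following; the URL and depth values are placeholders and not taken from the danjamker/N-Fly project.

# Hypothetical usage sketch of the runable class shown above.
r = runable(llwl='Brown', percen=80)
keywords = r.Select('http://example.com', depth=1)  # crawl one level deep
print(keywords)  # highest-rated keywords first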
Code Example #5
File: parseWikidata.py  Project: reconrus/IP1e
                        dest="chunk_size",
                        default=1024 * 1024 * 1024)
    args = parser.parse_args()
    args.chunk_size = int(args.chunk_size)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    logs_file = open(os.path.join(args.output, "logs.txt"), 'a+')

    pool = mp.Pool(mp.cpu_count())
    jobs = []
    start_time = time.time()
    print("Chunks of size %s" % size(args.chunk_size), file=logs_file)
    for jobID, chunk in enumerate(
            Chunker.chunkify(args.input, size=args.chunk_size)):
        job = pool.apply_async(
            process_chunk,
            (chunk, args.input, args.output, jobID, args.encoding))
        jobs.append(job)

    # Wait for every worker to finish before closing the pool.
    for job in jobs:
        job.get()

    pool.close()

    print("Total # of chunks: %d" % (jobID + 1), file=logs_file)
    print("Total time: {}".format(
        datetime.timedelta(seconds=time.time() - start_time)),
          file=logs_file)
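The script leaves the per-process output files written by process_chunk separate. As an optional follow-up, not part of the original script, a small helper along these lines could merge them into a single file; the function name and the merged file name are illustrative assumptions.

import glob
import os


def merge_outputs(output_folder, merged_name='processed_wikidata_all', encoding='utf-8'):
    # Concatenate every per-PID output file produced by process_chunk.
    merged_path = os.path.join(output_folder, merged_name)
    with open(merged_path, 'w', encoding=encoding) as merged:
        for part in sorted(glob.glob(os.path.join(output_folder, 'processed_wikidata_*'))):
            if part == merged_path:
                continue  # skip the merged file itself on re-runs
            with open(part, encoding=encoding) as f:
                merged.write(f.read())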