コード例 #1
0
ファイル: crawler.py プロジェクト: thekiminlee/Web-Crawler
 def process(self, text, doc_id):
     """Tokenize *text* with TextProcessor and return the updated index.

     A TextProcessor is constructed around the current index (``self.index``)
     and the document's id, run over the text, and the index it produces is
     handed back to the caller.
     """
     tp = TextProcessor(text, self.index, doc_id)
     tp.process_text()
     return tp.get_index()
コード例 #2
0
ファイル: prepare_data.py プロジェクト: ShT3ch/luigi_workshop
    def run(self):
        """Tokenize the ``name`` column of every input CSV and write results.

        For each input target, the ``name`` column is processed with
        TextProcessor (Russian language) and the resulting tokens are joined
        back into a single space-separated string before the frame is written
        to the matching output target.
        """
        logger.info('Creating text processor')
        text_processor = TextProcessor()

        # Hoist the target dicts out of the loop: the originals re-called
        # self.input()/self.output() on every iteration.
        inputs = self.input()
        outputs = self.output()

        for key, target in inputs.items():
            logger.info('Reading %s file: "%s"', key, target.path)
            df = pd.read_csv(target.path)

            logger.info('Its %s lines', df.shape[0])
            logger.info('Start processing %s...', key)

            # Tokenize and re-join in a single pass over the column.
            # []-indexing instead of df.name attribute assignment: attribute
            # writes are fragile when the column name collides with DataFrame
            # attributes.
            df['name'] = df['name'].map(
                lambda x: ' '.join(text_processor.process_text(x, lang='ru'))
            )

            out_path = outputs[key].path
            logger.info('Processing of %s succeed, writing it to "%s"', key, out_path)

            df.to_csv(out_path)
コード例 #3
0
# Script entry point (visible portion): trains a NaiveBayes classifier on the
# .txt files under data/1/training, then begins processing the testing set.
# NOTE(review): the testing loop is truncated in this view — whatever consumes
# `sentences` there lives past the end of this excerpt.
if __name__ == "__main__":
    # Depth-limited pretty-printer, presumably for inspecting nested results
    # later in the script — TODO confirm against the un-shown remainder.
    pp = pprint.PrettyPrinter(indent=4, depth=2)

    # Initialize classifier
    classifier = NaiveBayes()

    # Train
    for f in find("data/1/training"):
        # `find` presumably yields file-path strings — TODO confirm; each path
        # is stripped of surrounding whitespace before the extension check.
        f = f.strip()
        # Only plain-text documents participate in training.
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        # Variable name suggests sentence segmentation/tokenization — confirm
        # against nlp.process_text's actual contract.
        sentences = nlp.process_text(text)

        # Label is derived from the file path: any path mentioning "movie" is
        # labeled "movie", everything else is treated as "play".
        label = "movie" if "movie" in f else "play"

        classifier.train(sentences, label=label)

    # Test
    for f in find("data/1/testing"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        # Same preprocessing as the training loop; classification of these
        # sentences presumably follows beyond this excerpt.
        sentences = nlp.process_text(text)