def __init__(self,
                 stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):
        """
        Keyword arguments:
        stop_words_file  --  str,停止词文件路径,若不是str则是使用默认停止词文件
        delimiters       --  默认值是`?!;?!。;…\n`,用来将文本拆分为句子。
        
        Object Var:
        self.sentences               --  由句子组成的列表。
        self.words_no_filter         --  对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words     --  去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters       --  保留words_no_stop_words中指定词性的单词而得到的两级列表。
        """
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None
        self.embeddings_index = self.w2v()
    def run(self):
        """Executes segmentation for each input provided.

        For each pair of input created, runs the segmentation algorithm. If
        there are multiple organs in the same input, it will produce only one
        solution.

        """
        print("Started running...")
        for input in self.input_data:
            start_time = time.time()
            
            segments = self.get_segments(input)

            print("\nExtracting from {} file.".format(segments[0].file_name))
            print("Organs found: {}".format(len(segments)))
            
            original_image = segments[0].normalize(segments[0].image)
            original_image = segments[0].preprocess(original_image.image)

            sections_segmented = []
            for segment in segments:
                segmentation = Segmentation(segment)
                section_segmented = segmentation.execute()
                
                sections_segmented.append(section_segmented)
            
            self.save_solution(sections_segmented, original_image.image)
            elapsed_time = int(time.time() - start_time)

            minutes = elapsed_time % 3600 // 60
            seconds = elapsed_time % 60
            print("Finished extracting in {:02d}:{:02d}".format(minutes, 
                                                                seconds))
        print("Finished extracting.")
def main(opt):

    k1 = Keyword(opt['keyword1'], opt['data_dir'])
    text1 = k1.get_all_context_text()
    text1_repost = k1.get_all_repost_text()

    k2 = Keyword(opt['keyword2'], opt['data_dir'])
    text2 = k2.get_all_context_text()
    text2_repost = k2.get_all_repost_text()

    seg = Segmentation()
    document = seg.segmentation([text1, text2, text1_repost, text2_repost])

    ex = SubKeyExtractor()
    ex.fit(document)
    results = ex.extract(document, 20)

    print(results)

    scorer = SimilarityScore(topn=20)
    score = scorer.score(k1.get_top_context_text(), results[0], results[2],
                         k2.get_top_context_text(), results[1], results[3])

    print(score)

    print('done')
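For orientation, a minimal sketch of the options dict that main() above reads; the keys are exactly the ones accessed in main(), the values are placeholders.

if __name__ == '__main__':
    # Hypothetical driver; only the keys read by main() above are filled in.
    opt = {
        'keyword1': 'topic_a',   # placeholder keyword
        'keyword2': 'topic_b',   # placeholder keyword
        'data_dir': './data',    # placeholder data directory
    }
    main(opt)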
Example 4
    def __init__(self, stop_words_file=None, delimiters='?!;?!。;…\n'):
        '''
        stop_words_file: defaults to None; if set to a file path, a stop-word filter is built from that file
        delimiters: the set of sentence delimiters
        '''
        super(KeywordExtraction, self).__init__()
        '''Instance variables:
        self.keywords: list of keywords
        self.words_no_filter: two-level list obtained by tokenizing text (one word list per sentence).
        self.words_no_stop_words: two-level list obtained by tokenizing text and removing stop words.
        self.words_all_filters: two-level list obtained by tokenizing text, removing stop words, and keeping only the words with the allowed POS tags.
        self.word_index: dict mapping word -> index
        self.index_word: dict mapping index -> word
        self.graph: graph of words, used by the pagerank algorithm
        '''
        self.text = ''
        self.keywords = []
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                delimiters=delimiters)

        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.word_index = {}
        self.index_word = {}
        self.graph = None
Example 5
    def __init__(self,
                 stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):
        """
        Keyword arguments:
        stop_words_file  --  str,指定停止词文件路径(一行一个停止词),若为其他类型,则使用默认停止词文件
        delimiters       --  默认值是`?!;?!。;…\n`,用来将文本拆分为句子。
        
        Object Var:
        self.words_no_filter      --  对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words  --  去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters    --  保留words_no_stop_words中指定词性的单词而得到的两级列表。
        """
        self.text = ''
        self.keywords = None

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None
Example 6
    def __init__(self, stop_words_file = None, allow_speech_tags = utils.allow_speech_tags, delimiters = utils.sentence_delimiters):
        self.seg = Segmentation(stop_words_file=stop_words_file, allow_speech_tags=allow_speech_tags, delimiters=delimiters)
        # [s1, s2, ...]
        self.sentences = None
        # 2-dim list: [[w1, w2, ...], [w1, w2, ...]]
        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None
def runStuff(imgInput, saveToImagePath, readFromImagePath, hueMomentIndexX, hueMomentIndexY):
    p = Preprocessing(saveToImagePath, readFromImagePath, imgInput)
    s = Segmentation(saveToImagePath, readFromImagePath, p.imgMorph, imgInput)
    listX = s.getHuMomentsOfAllContours(s.contoursFrontGroundFiltered, hueMomentIndexX)
    listY = s.getHuMomentsOfAllContours(s.contoursFrontGroundFiltered, hueMomentIndexY)

    print "So the listX is:", listX
    print "So the listY is:", listY

    return listX, listY
Example 8
def runStuff(imgInput, saveToImagePath, readFromImagePath, hueMomentIndexX,
             hueMomentIndexY):
    p = Preprocessing(saveToImagePath, readFromImagePath, imgInput)
    s = Segmentation(saveToImagePath, readFromImagePath, p.imgMorph, imgInput)
    listX = s.getHuMomentsOfAllContours(s.contoursFrontGroundFiltered,
                                        hueMomentIndexX)
    listY = s.getHuMomentsOfAllContours(s.contoursFrontGroundFiltered,
                                        hueMomentIndexY)

    print "So the listX is:", listX
    print "So the listY is:", listY

    return listX, listY
	def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
		'''
		stop_words_file: defaults to None; if set to a file path, a stop-word filter is built from that file
		delimiters: the set of sentence delimiters
		'''
		super(SentenceExtraction, self).__init__()
		self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)
		self.sentences = None
		self.words_no_filter = None
		self.words_no_stop_words = None
		self.words_all_filters = None

		self.graph = None
		self.key_sentences = None
Example 10
    def __init__(self, stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):

        self.text = ''
        self.keywords = None

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None
Example 11
    def __init__(self, stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)
        # self.sentences               --  list of sentences.
        # self.words_no_filter         --  two-level list obtained by tokenizing each sentence in sentences.

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None
Example 12
    def __init__(self, stop_words_file=None, delimiters="?!;?!。;…\n"):
        """
		stop_words_file: 默认为None,若设置为文件路径,将由该文件构造停止词过滤器
		delimiters: 分隔符集合
		"""
        super(KeywordExtraction, self).__init__()
        """变量说明
		self.keywords: 关键词列表
        self.words_no_filter:对text进行分词而得到的两级列表(每行为一个句子的分词结果列表)。
        self.words_no_stop_words:对text进行分词,同时去掉停止词而得到的两级列表。
        self.words_all_filters:对text进行分词,同时去停止词,保留指定词性的单词而得到的两级列表。
        self.word_index: 字典(单词-下标)
        self.index_word: 字典(下标-单词)
        self.graph: 由单词构成的图,用于pagerank算法
		"""
        self.text = ""
        self.keywords = []
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)

        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.word_index = {}
        self.index_word = {}
        self.graph = None
Example 13
def seg_from_json(fname, gt_flag):
    with open(fname) as data_file:
        data = json.load(data_file)

    annotations = {}
    if gt_flag:
        annotations = data[0]["annotations"]
    else:
        annotations = data["annotations"]
    # print annotations[0].items()

    # TODO: make sure to address the box type (article/image/title) assignment problem (i.e.
    # at box or polygon level)

    polygon_dict = {}
    # polygon_dict has key = id, value = list of boxes
    for segment in annotations:
        if segment['id'] not in polygon_dict:
            polygon_dict[segment['id']] = []
        coord0 = [segment['y'], segment['x']]
        coord1 = [coord0[0] + segment['height'], coord0[1] + segment['width']]
        polygon_dict[segment['id']].append(Box(coor0=coord0, coor1=coord1))

    # convert polygons dict to Segmentation object
    #seg = Segmentation()

    polygons = [Polygon(boxes=boxList) for boxList in polygon_dict.values()]
    return Segmentation(segments=polygons)
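For reference, a hedged sketch of the annotation JSON this loader expects, inferred only from the fields read above (ground-truth files wrap the object in a list, hence data[0]):

# Assumed layout; the field names are exactly those accessed in seg_from_json.
example_ground_truth = [
    {
        "annotations": [
            {"id": "region_1", "x": 10, "y": 20, "width": 100, "height": 50},
            {"id": "region_1", "x": 10, "y": 70, "width": 100, "height": 30},
        ]
    }
]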
Example 14
def box_from_json(fname):
    with open(fname) as data_file:
        data = json.load(data_file)
    annotations = data[0]["annotations"]
    polygon_dict = {}
    # polygon_dict has key = id, value = list of boxes

    id = 0
    for segment in annotations:
        id_str = str(id)
        if id_str not in polygon_dict:
            polygon_dict[id_str] = []
        id += 1
        coord0 = [segment['y'], segment['x']]
        coord1 = [coord0[0] + segment['height'], coord0[1] + segment['width']]
        polygon_dict[id_str].append(
            Box(coor0=coord0, coor1=coord1, label=segment['class']))
    # convert polygons dict to Segmentation object

    polygons = [Polygon(boxes=boxList) for boxList in polygon_dict.values()]
    #img_name = fname[:fname.rfind('/') + 1] + data[0]['filename']
    # newsimage = NewsImage(img_name[:img_name.rfind('.')])
    # print img_name[:img_name.rfind('.')]
    #for p in polygons:
    #	p.weight_image = newsimage
    #Polygon.weight_image = newsimage
    return Segmentation(segments=polygons)
Example 15
class Predict:
    finder = Finder()
    segmenter = Segmentation()
    charcterClassfier = Character()
    write_location = 'output/unconfirmed/'
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    def predict(self, img):
        licenceplates_img = self.finder.find_lps(img)
        licenceplates = []
        for lp in licenceplates_img:
            seg_img = self.segmenter.segment(lp)
            lp = self.segmenter.get_preprocessed_img()
            seg_img, character_list = self.isolate_chars(seg_img)
            if len(character_list) >= 6 and len(character_list) < 12:
                result_text =  self.charcterClassfier.classify(character_list)
                character_list = self.charcterClassfier.getPrepImgs()
                licenceplates.append(result_text)
                self.save(img, lp, seg_img, character_list, result_text)
            else:
                self.logger.info("{} characters found in licence plate, discarding..".format(len(character_list)))
        return licenceplates
    
    def isolate_chars(self, img):
        chars = []
        orig_thresh = cv2.bitwise_not(img)
        #cv2.imshow("out2", orig_thresh)
        #cv2.waitKey(0)
        image, contours, _ = cv2.findContours(orig_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cleaned_img = np.full_like(img, 255)
        for contour in contours:
            x,y,w,h = cv2.boundingRect(contour)
            #print(x,y,w,h)
            if w > 10 and w < 80 and h > 40 and h < 100:
            #if w > 2 and h > 10:
                roi_img = img[y:y+h,x:x+w]
                cleaned_img[y:y+h,x:x+w] = img[y:y+h,x:x+w]
                chars.append([x, roi_img])
        chars = sorted(chars, key=lambda item: item[0]) # sort on x value
        chars = list(map(lambda item: item[1], chars)) # return only images
        return cleaned_img, chars
    
    def save(self, img, lp, seg_img, char_list, result):
        rand_str = self.id_generator() + "/"
        directory = self.write_location + rand_str
        if not os.path.exists(directory):
            os.makedirs(directory)
            cv2.imwrite(directory + "orig.png", img)
            cv2.imwrite(directory + "lp.png", lp)
            cv2.imwrite(directory + "seg.png", seg_img)
            for i,char in enumerate(char_list):
                cv2.imwrite(directory + str(i)+".png", char)
            with open(directory + "result.txt", 'w') as result_file:
                result_file.write(result)

    def id_generator(self, size=8, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))
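A hypothetical driver for the Predict class above; the image path is a placeholder and the OpenCV import mirrors the calls already used inside the class.

import cv2

# Hypothetical usage of Predict as defined above.
plates = Predict().predict(cv2.imread('car.jpg'))   # 'car.jpg' is a placeholder path
print(plates)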
Example 16
def run_selected_test(file_name, inst_number, sil, snd, th):
    os.system("rm -r Images/*")
    Test = Segmentation(inst_number, sil, snd, th)
    Test.segment_audio(file_name)
    Test.get_durations()

    Test.std_dev = Results().std_dev(Test.durations_list)
    Test.average = Results().average(Test.durations_list)

    Results().plot_durations(Test.durations_list, Test.inst_number)
Example 17
class TextRank4Sentence(object):

    def __init__(self, stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)
        # self.sentences               --  list of sentences.
        # self.words_no_filter         --  two-level list obtained by tokenizing each sentence in sentences.

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None

    def analyze(self, text, lower=False,
                source='no_stop_words',
                sim_func=util.get_similarity,
                pagerank_config={'alpha': 0.85, }):


        self.key_sentences = []

        result = self.seg.segment(text=text, lower=lower)
        self.sentences = result.sentences
        self.words_no_filter = result.words_no_filter
        self.words_no_stop_words = result.words_no_stop_words
        self.words_all_filters = result.words_all_filters

        options = ['no_filter', 'no_stop_words', 'all_filters']
        if source in options:
            _source = result['words_' + source]
        else:
            _source = result['words_no_stop_words']

        self.key_sentences = util.sort_sentences(sentences=self.sentences,
                                                 words=_source,
                                                 sim_func=sim_func,
                                                 pagerank_config=pagerank_config)

    def get_key_sentences(self, num=6, sentence_min_len=6):
        """获取最重要的num个长度大于等于sentence_min_len的句子用来生成摘要。
"""
        result = []
        count = 0
        for item in self.key_sentences:
            if count >= num:
                break
            if len(item['sentence']) >= sentence_min_len:
                result.append(item)
                count += 1
        return result
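A minimal usage sketch for the TextRank4Sentence class above, assuming the surrounding Segmentation/util modules are importable; the sample text is a placeholder.

# Hypothetical usage of TextRank4Sentence as defined above.
tr4s = TextRank4Sentence()
tr4s.analyze(text='第一句话。第二句话!第三句话?', lower=True, source='no_stop_words')  # placeholder text
for item in tr4s.get_key_sentences(num=2, sentence_min_len=2):
    print(item['sentence'])   # items are subscriptable by 'sentence', as in get_key_sentences above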
Example 18
    def __init__(self,
                 stop_words_file=None,
                 allow_speech_tags=utils.allow_speech_tags,
                 delimiters=utils.sentence_delimiters):
        """

        :param stop_words_file: path to the stop-words file
        :param allow_speech_tags: POS tags to keep
        :param delimiters: list of punctuation marks used to split the text into sentences
        """
        self.text = ''
        self.keywords = None
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)
        self.sentences = None
        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None
Example 19
    def __init__(self, stop_words_file=None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`: defaults to None, in which case the internal stop-word list is empty; it can be set to a file path (str), and stop words will then be read from that file.
        `delimiters`: defaults to `'?!;?!。;…\n'`; used to split the text into sentences.

        self.sentences: list of sentences.
        self.words_no_filter: two-level list obtained by tokenizing each sentence in sentences.
        self.words_no_stop_words: two-level list obtained by removing stop words from words_no_filter.
        self.words_all_filters: two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.graph = None
        self.key_sentences = None
Example 20
def save_data(result_file, data):
    """
    saves multiple numpy arrays in one file.
    :param result_file:
    :param data: takes a generator of numpy array
    :return:
    """
    segment = Segmentation()
    # for element in data:
    try:
        while True:
            element = next(data)  # generator
            element = segment.format_data(segment.segmentMaryam(element))
            with open(result_file, 'a') as f_handle:
                np.savetxt(f_handle, element,  delimiter=" ", footer='####################################'
                                                                     '######################################'
                                                                     '#######################################'
                                                                     '############')
    except StopIteration:
        pass
Example 21
def pre_processing(image_file, file):

    biggest_component = Remove_Calibration.cutout_img(image_file)
    #Remove_Calibration.save_image(biggest_component, file.split(".")[0])

    bin_image = Binarization.binarize(biggest_component)
    #Binarization.save_image(bin_image, file.split(".")[0])

    # Returns a list of segmented characters and number of rows
    cropped_characters, row, no_ofChar = Segmentation.segmentation(bin_image)
    # Segmentation.save_segmented_characters(cropped_characters, row, file)

    return cropped_characters, row, no_ofChar
Example 22
def save_data(result_file, data):
    """
    saves multiple numpy arrays in one file.
    :param result_file:
    :param data: takes a generator of numpy array
    :return:
    """
    segment = Segmentation()
    # for element in data:
    try:
        while True:
            element = next(data)  # generator
            element = segment.format_data(segment.segmentMaryam(element))
            with open(result_file, 'a') as f_handle:
                np.savetxt(f_handle,
                           element,
                           delimiter=" ",
                           footer='####################################'
                           '######################################'
                           '#######################################'
                           '############')
    except StopIteration:
        pass
Example 23
def main():
    model1 = ModelDef(tf.keras.models.load_model("models/minstM.h5"),
                      model1swaps)
    model2 = ModelDef(tf.keras.models.load_model("models/model2.h5"),
                      model2swaps)
    ansamble = Ansamble([model1, model2], classes)
    segmentator = Segmentation(max_norm=254)
    segmentatorcv = SegmentatorCV()
    commander = Commander(ansamble, segmentator)
    #print(commander.eval("2p2.bmp"))
    print(commander.eval("txt2.bmp"))
    #print(commander.eval("complex_expression.bmp"))
    #print(commander.eval("extreme2.bmp"))
    #segmentatorcv.writeDigetsToFolder("extreme2.bmp", "ims")
    commander.writeDigetsToFolder("txt2.bmp", "ims")
Example 24
def DetectLP1():
    # Load
    origin = cv2.imread(fullpath)
    # Denoise
    origin = Denoise(origin)
    # Resize
    h, w, c = origin.shape
    img = cv2.resize(origin, ((w * 200) / h, 200))
    # Segmentation
    out1, out2 = Segmentation(img=opencv2skimage(img), Debug=True)
    # Blue Color Filter
    mask, res = ColorFilter(origin=skimage2opencv(out1), Debug=False)
    # Mark
    box = findBBox(skimage2opencv(img), mask, Debug=True)
    # Check Candidate
    drawBBox(img, box, Debug=True)
Example 25
class TextRank4Sentence(object):
    def __init__(self, stop_words_file = None, allow_speech_tags = utils.allow_speech_tags, delimiters = utils.sentence_delimiters):
        self.seg = Segmentation(stop_words_file=stop_words_file, allow_speech_tags=allow_speech_tags, delimiters=delimiters)
        # [s1, s2, ...]
        self.sentences = None
        # 2-dim list: [[w1, w2, ...], [w1, w2, ...]]
        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None

    def analyze(self, text, lower = False,
                source = 'no_stop_words',
                sim_func = utils.get_similarity,
                damping_coeffi=0.85):
        self.key_sentences = list()
        result = self.seg.segment(text=text, lower=lower)

        self.sentences = result.sentences

        self.words_no_filter = result.words_no_filter
        self.words_no_stop_words = result.words_no_stop_words
        self.words_all_filters = result.words_all_filters
        options = ['no_filter', 'no_stop_words', 'all_filters']

        if source in options:
            source_ = result['words_' + source]
        else:
            source_ = result['words_no_stop_words']

        self.key_sentences = utils.sort_sentences(sentences=self.sentences, words=source_, sim_func=sim_func, damping_coeffi=damping_coeffi)


    def get_key_sentences(self, num = 5, sentence_min_len = 6):
        """get num sentences whose length is >= min_len"""
        result = list()
        cnt = 0
        for item in self.key_sentences:
            if cnt >= num:
                break
            if len(item['sentence']) >= sentence_min_len:
                result.append(item)
                cnt += 1
        return result
    def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`: defaults to None, in which case the internal stop-word list is empty; it can be set to a file path (str), and stop words will then be read from that file.
        `delimiters`: defaults to `'?!;?!。;…\n'`; used to split the text into sentences.

        self.sentences: list of sentences.
        self.words_no_filter: two-level list obtained by tokenizing each sentence in sentences.
        self.words_no_stop_words: two-level list obtained by removing stop words from words_no_filter.
        self.words_all_filters: two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None     # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.graph = None
        self.key_sentences = None
Example 27
    def create_instances(
        self
    ):  # Create a Segmentation instance for every distinct parameter set.
        st_dev = []
        av_dur = []
        for i in range(0, len(self.all_combinations)):
            print("Cargando Prueba " + str(i) + "...\n")

            self.inst_list.append(
                Segmentation(i, self.all_combinations[i][0],
                             self.all_combinations[i][1],
                             self.all_combinations[i][2]))
            self.inst_list[i].segment_audio(self.file_name)
            self.inst_list[i].get_durations()

            lista_dur = self.inst_list[i].durations_list
            self.inst_list[i].std_dev = Results().std_dev(lista_dur)
            self.inst_list[i].average = Results().average(lista_dur)

            Results().plot_durations(lista_dur, self.inst_list[i].inst_number)

            self.inst_list[i].delete_audio()

            # This block builds the dataframe with all durations from all tests
            self.Durations_DF = Results().get_durations_df(
                self.inst_list[i], self.Durations_DF)

            # This block builds the dataframe with the average duration and standard deviation of each test

            st_dev.append(self.inst_list[i].std_dev)
            av_dur.append(self.inst_list[i].average)

            self.Summary_DF = Results().get_summary_df(self.inst_list[i],
                                                       st_dev, av_dur, i,
                                                       self.Summary_DF)

        # This block adds the maximum and minimum average duration and standard deviation to the dataframe
        # self.Summary_DF.loc[0, 'MAX_Dur'] = max(av_dur)
        # self.Summary_DF.loc[0, 'MIN_Dur'] = min(av_dur)
        # self.Summary_DF.loc[0, 'MAX_STD'] = max(st_dev)
        # self.Summary_DF.loc[0, 'MIN_STD'] = min(st_dev)

        Results().create_CSV(self.Summary_DF, self.Durations_DF)
class TextRank4Sentence(object):
    
    def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`: defaults to None, in which case the internal stop-word list is empty; it can be set to a file path (str), and stop words will then be read from that file.
        `delimiters`: defaults to `'?!;?!。;…\n'`; used to split the text into sentences.

        self.sentences: list of sentences.
        self.words_no_filter: two-level list obtained by tokenizing each sentence in sentences.
        self.words_no_stop_words: two-level list obtained by removing stop words from words_no_filter.
        self.words_all_filters: two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)
        
        self.sentences = None
        self.words_no_filter = None     # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None
        
        self.graph = None
        self.key_sentences = None
        
    def train(self, text, lower = False, speech_tag_filter=True,
              source = 'no_stop_words', sim_func = 'standard'):
        '''
        `text`: the text to process, a string.
        `lower`: whether to lowercase the text. Defaults to False.
        `speech_tag_filter`: if True, the internal POS-tag list is used to filter and produce words_all_filters;
                        if False, words_all_filters is the same as words_no_stop_words.
        `source`: which of words_no_filter, words_no_stop_words, words_all_filters is used to compute the similarity between sentences.
                Defaults to `'no_stop_words'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`.
        `sim_func`: the sentence-similarity function. Currently there is only one, corresponding to the default value `standard`.
        '''
        
        self.key_sentences = []
        
        (self.sentences, self.words_no_filter, self.words_no_stop_words, self.words_all_filters) = self.seg.segment(text=text,  
                                                                                                                    lower=lower, 
                                                                                                                    speech_tag_filter=speech_tag_filter);
        # -
        
        # print self.sentences   
                                                                                                          
        if source == 'no_filter':
            source = self.words_no_filter
        elif source == 'all_filters':
            source = self.words_all_filters
        else:
            source = self.words_no_stop_words
            
        sim_func = self._get_similarity_standard
        
        sentences_num = len(source)
        
        self.graph = np.zeros((sentences_num, sentences_num))
        
        for x in xrange(sentences_num):
            for y in xrange(x, sentences_num):
                similarity = sim_func(source[x], source[y])
                self.graph[x, y] = similarity
                self.graph[y, x] = similarity
                
#         for x in xrange(sentences_num):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum
                
        # print self.graph
                
        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph) # this is a dict
        sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
        
        # print sorted_scores
        
        for index, _ in sorted_scores:
            self.key_sentences.append(self.sentences[index])
        
        return   self.sentences  
        # print '\n'.join(self.key_sentences)
        

    def _get_similarity_standard(self, word_list1, word_list2):
        '''
        Default function for computing the similarity of two sentences.
        word_list1, word_list2: the two sentences, each a list of words.
        '''
        vector1, vector2 =self._gen_vectors(word_list1, word_list2)
        
        # print vector1, vector2
        
        vector3 = [vector1[x]*vector2[x]  for x in xrange(len(vector1))]
        vector4 = [1 for num in vector3 if num > 0.]
        co_occur_num = sum(vector4)
        
        # print co_occur_num
        
        if co_occur_num == 0.:
            return 0.
        
        denominator = math.log(float(len(word_list1))) + math.log(float(len(word_list2)))  # denominator
        
        if denominator == 0.:
            return 0.
        
        return co_occur_num / denominator
        
        
    def _gen_vectors(self, word_list1, word_list2):
        '''
        Convert two sentences into two vectors of the same size; the two vectors can then be used to compute the similarity of the sentences.
        word_list1, word_list2: the two sentences, each a list of words.
        '''
        words = list(set(word_list1 + word_list2))        
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]
        return vector1, vector2
            
    def get_key_sentences(self, num = 6, sentence_min_len = 6):
        '''
        Get the num most important sentences whose length is at least sentence_min_len, used to build the summary.
        Returns a list.
        '''
        result = []
        count = 0
        for sentence in self.key_sentences:
            if count >= num:
                break
            if len(sentence) >= sentence_min_len:
                result.append(sentence)
                count += 1
        return result
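The similarity used by train() above counts the words two sentences share and normalizes by the log of their lengths; a standalone, illustrative restatement of _get_similarity_standard:

import math

def co_occurrence_similarity(word_list1, word_list2):
    # Count the distinct words that occur in both sentences.
    co_occur_num = len(set(word_list1) & set(word_list2))
    if co_occur_num == 0:
        return 0.0
    # Normalize by log(|s1|) + log(|s2|), as in the method above.
    denominator = math.log(len(word_list1)) + math.log(len(word_list2))
    return co_occur_num / denominator if denominator != 0.0 else 0.0

# e.g. two tokenized sentences sharing one word:
print(co_occurrence_similarity(['textrank', 'keyword'], ['textrank', 'sentence', 'rank']))  # ~0.56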
Example 29
class TextRank4Keyword(object):
    # delimiters       --  defaults to `?!;?!。;…\n`; used to split the text into sentences.
    # self.words_no_filter      --  two-level list obtained by tokenizing each sentence in sentences.
    # self.words_no_stop_words  --  two-level list obtained by removing stop words from words_no_filter.
    # self.words_all_filters    --  two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
    def __init__(self, stop_words_file=None,
                 allow_speech_tags=util.allow_speech_tags,
                 delimiters=util.sentence_delimiters):

        self.text = ''
        self.keywords = None

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

    def analyze(self, text,  # build the word graph and rank the keywords
                window=2,
                lower=False,
                vertex_source='all_filters',
                edge_source='no_stop_words',
                pagerank_config={'alpha': 0.85, }):
        # text    --  the text to process, a string.
        # window  --  window size, int, used to build edges between words. Defaults to 2.
        # lower   --  whether to lowercase the text. Defaults to False.

        # self.text = util.as_text(text)
        self.text = text
        self.word_index = {}
        self.index_word = {}
        self.keywords = []
        self.graph = None

        result = self.seg.segment(text=text, lower=lower)
        self.sentences = result.sentences
        self.words_no_filter = result.words_no_filter
        self.words_no_stop_words = result.words_no_stop_words
        self.words_all_filters = result.words_all_filters

        options = ['no_filter', 'no_stop_words', 'all_filters']

        if vertex_source in options:
            _vertex_source = result['words_' + vertex_source]
        else:
            _vertex_source = result['words_all_filters']

        if edge_source in options:
            _edge_source = result['words_' + edge_source]
        else:
            _edge_source = result['words_no_stop_words']

        self.keywords = util.sort_words(_vertex_source, _edge_source, window=window, pagerank_config=pagerank_config)

    def get_keywords(self, num=6, word_min_len=1):  # get the keywords

        result = []
        count = 0
        for item in self.keywords:
            if count >= num:
                break
            if len(item.word) >= word_min_len:
                result.append(item)
                count += 1
        return result

    def get_keyphrases(self, keywords_num=12, min_occur_num=2):  # get the key phrases
        keywords_set = set([item.word for item in self.get_keywords(num=keywords_num, word_min_len=1)])
        keyphrases = set()
        for sentence in self.words_no_filter:
            one = []
            for word in sentence:
                if word in keywords_set:
                    one.append(word)
                else:
                    if len(one) > 1:
                        keyphrases.add(''.join(one))
                    if len(one) == 0:
                        continue
                    else:
                        one = []
            # flush the trailing phrase, if any
            if len(one) > 1:
                keyphrases.add(''.join(one))

        return [phrase for phrase in keyphrases
                if self.text.count(phrase) >= min_occur_num]
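A minimal usage sketch for the keyword extractor above; sample_text stands in for any input string, and the .word attribute is the one get_keywords() itself relies on.

# Hypothetical usage of TextRank4Keyword as defined above.
sample_text = '这里是一段用于演示的占位文本。'   # placeholder text
tr4w = TextRank4Keyword()
tr4w.analyze(text=sample_text, window=2, lower=True)
for item in tr4w.get_keywords(num=10, word_min_len=2):
    print(item.word)
print(tr4w.get_keyphrases(keywords_num=10, min_occur_num=2))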
Example 30
class SentenceExtraction(object):
	"""docstring for SentenceExtraction"""
	def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
		'''
		stop_words_file: defaults to None; if set to a file path, a stop-word filter is built from that file
		delimiters: the set of sentence delimiters
		'''
		super(SentenceExtraction, self).__init__()
		self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)
		self.sentences = None
		self.words_no_filter = None
		self.words_no_stop_words = None
		self.words_all_filters = None

		self.graph = None
		self.key_sentences = None

	def train(self, text, lower = False, speech_tag_filter = True,source = 'all_filters',sim_func='Standard'):
		'''
		text: the text to process
		lower: whether to lowercase the text
		speech_tag_filter: if True, filter with the default self.default_speech_tag_filter;
						if a list, filter with speech_tag_filter;
						otherwise, do not filter
		source: which tokenization result is used to compute the similarity between sentences;
						defaults to `'all_filters'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`
		'''
		self.key_sentences = []
		(self.sentences, 
		self.words_no_filter, 
		self.words_no_stop_words, 
		self.words_all_filters) = self.seg.segment_text(text=text, lower=lower, speech_tag_filter=speech_tag_filter)

		if source == 'no_filter':
			source = self.words_no_filter
		elif source == 'all_filters':
			source = self.words_all_filters
		else:
			source = self.words_no_stop_words

		if sim_func == 'Standard':
			sim_function = self._get_similarity_standard
		elif sim_func == 'Levenshtein Distance':
			sim_function = self._get_similarity_ld
		else:
			sim_function = self._get_similarity_standard
		
		sentences_num = len(source)
		self.graph = np.zeros((sentences_num,sentences_num))

		for x in xrange(sentences_num):
			for y in xrange(x,sentences_num):
				similarity = sim_function(source[x],source[y])
				self.graph[x,y] = similarity
				self.graph[y,x] = similarity

		nx_graph = nx.from_numpy_matrix(self.graph)
		scores = nx.pagerank(nx_graph)
		sorted_scores = sorted(scores.items(),key = lambda item: item[1],reverse=True)

		for index,_ in sorted_scores:
			self.key_sentences.append(self.sentences[index])

	def _get_similarity_standard(self, sentence1, sentence2):
		'''
		Compute the similarity of two sentences; sentence1 and sentence2 are the two sentences to compare.
		'''
		words = list(set(sentence1+sentence2))
		vector1 = [float(sentence1.count(word)) for word in words]
		vector2 = [float(sentence2.count(word)) for word in words]
		words_occur_in_common = [1 for x in xrange(len(vector1)) if vector1[x]*vector2[x] > 0.]
		num_of_common_words = sum(words_occur_in_common)

		if num_of_common_words == 0.:
			return 0.
		denominator = math.log(float(len(sentence1))) + math.log(float(len(sentence2)))
		if denominator == 0.:
			return 0.
		return num_of_common_words / denominator

	def _get_similarity_ld(self,sentence1,sentence2):
		if len(sentence1) > len(sentence2):
			sentence1,sentence2 = sentence2, sentence1
		distances = range(len(sentence1) + 1)
		for index2, char2 in enumerate(sentence2):
			newDistances = [index2 + 1]
			for index1, char1 in enumerate(sentence1):
				if char1 == char2:
					newDistances.append(distances[index1])
				else:
					newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
			distances = newDistances
		return distances[-1]


	def get_key_sentences(self, sentences_min_len = 6, sentences_percent = '10%'):
		'''
		Get the key sentences that make up the summary.
		'''
		result = []
		total_len = 0
		sentences_percent = filter(lambda x:x.isdigit(), sentences_percent)
		sentences_num = (len(self.sentences) * int(sentences_percent) )/ 100
		if sentences_num <= 0:
			sentences_num = 1
		#test
		print len(self.sentences)
		print sentences_percent
		print sentences_num

		for sentence in self.key_sentences:
			if total_len >= sentences_num:
				break
			tmp = len(sentence)
			if tmp >= sentences_min_len :
				if total_len+1  <= sentences_num:
					result.append(sentence)
					total_len += 1
		return result
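A hedged usage sketch (Python 2, matching the class above); train() also accepts sim_func='Levenshtein Distance' to switch to the edit-distance similarity defined above, and get_key_sentences() takes a percentage string.

# Hypothetical usage of SentenceExtraction as defined above (Python 2 style).
sample_text = u'第一句话。第二句话。第三句话。'   # placeholder text
se = SentenceExtraction()
se.train(text=sample_text, lower=True, source='all_filters', sim_func='Standard')
for sentence in se.get_key_sentences(sentences_min_len=2, sentences_percent='50%'):
    print sentence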
Example 31
class TextRank4Keyword(object):
    def __init__(self, stop_words_file=None, delimiters='?!;?!。.;…\n'):
        '''
        `stop_words_file`: defaults to None, in which case the internal stop-word list is empty; it can be set to a file path (str), and stop words will then be read from that file.
        `delimiters`: defaults to `'?!;?!。.;…\n'`; used to split the text into sentences.

        self.words_no_filter: two-level list obtained by tokenizing each sentence in sentences.
        self.words_no_stop_words: two-level list obtained by removing stop words from words_no_filter.
        self.words_all_filters: two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
        '''
        self.text = ''
        self.keywords = []

        self.seg = Segmentation(stop_words_file=stop_words_file,
                                delimiters=delimiters)

        self.words_no_filter = None  # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.word_index = {}
        self.index_word = {}
        self.graph = None

    def train(self,
              text,
              window=2,
              lower=False,
              speech_tag_filter=True,
              vertex_source='all_filters',
              edge_source='no_stop_words'):
        '''
        `text`: the text to process, a string.
        `window`: window size, int, used to build edges between words. Defaults to 2.
        `lower`: whether to lowercase the text. Defaults to False.
        `speech_tag_filter`: if True, the internal POS-tag list is used to filter and produce words_all_filters;
                        if False, words_all_filters is the same as words_no_stop_words.
        `vertex_source`: which of words_no_filter, words_no_stop_words, words_all_filters is used to build the nodes of the pagerank graph.
                        Defaults to `'all_filters'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`. Keywords are also drawn from `vertex_source`.
        `edge_source`: which of words_no_filter, words_no_stop_words, words_all_filters is used to build the edges of the pagerank graph.
                        Defaults to `'no_stop_words'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`. Edge construction also depends on the `window` parameter.
        '''

        self.text = text
        self.word_index = {}
        self.index_word = {}
        self.keywords = []
        self.graph = None
        self.words_copy = []

        (_, self.words_no_filter, self.words_no_stop_words,
         self.words_all_filters) = self.seg.segment(
             text=text, lower=lower, speech_tag_filter=speech_tag_filter)

        if vertex_source == 'no_filter':
            vertex_source = self.words_no_filter
        elif vertex_source == 'no_stop_words':
            vertex_source = self.words_no_stop_words
        else:
            vertex_source = self.words_all_filters

        if edge_source == 'no_filter':
            edge_source = self.words_no_filter
        elif edge_source == 'all_filters':
            edge_source = self.words_all_filters
        else:
            edge_source = self.words_no_stop_words

        # print edge_source, "-----------------------------"
        for words in vertex_source:
            for word in words:
                self.words_copy.append(word)
        #print self.words_copy
        index = 0
        for words in vertex_source:
            for word in words:
                # print word
                if not self.word_index.has_key(word):
                    #print word
                    self.word_index[word] = index
                    self.index_word[index] = word
                    index += 1

        words_number = index  # number of distinct words
        print index
        self.graph = np.zeros(
            (words_number, words_number))  # np.zeros((3, 4)) builds a zero matrix of the given shape, e.g.
        #  [[ 0.  0.  0.  0.]
        #   [ 0.  0.  0.  0.]
        #   [ 0.  0.  0.  0.]]

        for word_list in edge_source:
            for w1, w2 in self.combine(
                    word_list, window):  # `window`: window size, int, used to build edges between words. Defaults to 2.
                if not self.word_index.has_key(w1):  # has_key() checks whether the key exists (Python 2)
                    continue
                if not self.word_index.has_key(w2):
                    continue
                index1 = self.word_index[w1]
                index2 = self.word_index[w2]
                self.graph[index1][index2] = 1.0
                self.graph[index2][index1] = 1.0

#         for x in xrange(words_number):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum

        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph)  # this is a dict
        for word in scores:
            #print word,scores[word],self.words_copy.count(word)
            scores[word] = scores[word] * self.words_copy.count(
                self.index_word[word])
        sorted_scores = sorted(scores.items(),
                               key=lambda item: item[1],
                               reverse=True)
        #print sorted_scores
        for index, _ in sorted_scores:
            self.keywords.append(self.index_word[index])
            # print '----------------------'
            # print self.index_word[index], _
            #

    def combine(self, word_list, window=2):
        '''
        Build the word pairs within the given window, used to construct edges between words. Implemented as a generator.
        word_list: a list of words.
        window: window size.
        '''
        window = int(window)
        if window < 2: window = 2
        for x in xrange(
                1, window):  # xrange returns a lazy xrange object (a generator-like sequence, not a list)
            if x >= len(word_list):
                break
            word_list2 = word_list[x:]
            res = zip(word_list, word_list2)
            # e.g. x = [1, 2, 3]; y = [4, 5, 6]; z = [7, 8, 9]
            # zip(x, y, z) -> [(1, 4, 7), (2, 5, 8), (3, 6, 9)]
            for r in res:
                yield r  # a function containing yield is compiled into a generator

    def get_keywords(self, num=6, word_min_len=1):
        '''
        Get the num most important keywords whose length is at least word_min_len.
        Returns a list of keywords.
        '''
        result = []
        count = 0
        for word in self.keywords:
            if count >= num:
                # print 'OK'
                break
            if len(word) >= word_min_len:
                result.append(word)
                count += 1
        return result

    def get_keyphrases(self, keywords_num=12, min_occur_num=2):
        '''
        Get key phrases.
        Build candidate phrases from the top keywords_num keywords; a phrase must occur at least min_occur_num times in the original text.
        Returns a list of key phrases.
        '''
        keywords_set = set(self.get_keywords(num=keywords_num, word_min_len=1))

        keyphrases = set()
        one = []
        for sentence_list in self.words_no_filter:
            for word in sentence_list:
                # print '/'.join(one)
                # print word
                if word in keywords_set:
                    one.append(word)
                else:
                    if len(one) > 1:
                        keyphrases.add(''.join(one))
                        one = []
                        continue
                    one = []

        return [
            phrase for phrase in keyphrases
            if self.text.count(phrase) >= min_occur_num
        ]
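To make the window parameter concrete, a module-level restatement of the combine() pairing logic above, run on a toy word list:

def combine_demo(word_list, window=2):
    # Same pairing as TextRank4Keyword.combine above: each word is paired with
    # the words up to window-1 positions to its right.
    window = max(int(window), 2)
    for x in range(1, window):
        if x >= len(word_list):
            break
        for pair in zip(word_list, word_list[x:]):
            yield pair

print(list(combine_demo(['w1', 'w2', 'w3'], window=3)))
# -> [('w1', 'w2'), ('w2', 'w3'), ('w1', 'w3')]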
Example 32
class TextRank4Keyword(object):
    
    def __init__(self, stop_words_file = None, delimiters = '?!;?!。;…\n'):
        '''
        `stop_words_file`: defaults to None, in which case the internal stop-word list is empty; it can be set to a file path (str), and stop words will then be read from that file.
        `delimiters`: defaults to `'?!;?!。;…\n'`; used to split the text into sentences.

        self.words_no_filter: two-level list obtained by tokenizing each sentence in sentences.
        self.words_no_stop_words: two-level list obtained by removing stop words from words_no_filter.
        self.words_all_filters: two-level list obtained by keeping only the words in words_no_stop_words with the allowed POS tags.
        '''
        self.text = ''
        self.keywords = []
        
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)
        
        self.words_no_filter = None     # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None
        
        self.word_index = {}
        self.index_word = {}
        self.graph = None
        
    def train(self, text, window = 2, lower = False, speech_tag_filter=True, 
              vertex_source = 'all_filters',
              edge_source = 'no_stop_words'):
        '''
        `text`: the text to process, a string.
        `window`: window size, int, used to build edges between words. Defaults to 2.
        `lower`: whether to lowercase the text. Defaults to False.
        `speech_tag_filter`: if True, the internal POS-tag list is used to filter and produce words_all_filters;
                        if False, words_all_filters is the same as words_no_stop_words.
        `vertex_source`: which of words_no_filter, words_no_stop_words, words_all_filters is used to build the nodes of the pagerank graph.
                        Defaults to `'all_filters'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`. Keywords are also drawn from `vertex_source`.
        `edge_source`: which of words_no_filter, words_no_stop_words, words_all_filters is used to build the edges of the pagerank graph.
                        Defaults to `'no_stop_words'`; valid values are `'no_filter', 'no_stop_words', 'all_filters'`. Edge construction also depends on the `window` parameter.
        '''
        
        self.text = text
        self.word_index = {}
        self.index_word = {}
        self.keywords = []
        self.graph = None
        
        (_, self.words_no_filter, self.words_no_stop_words, self.words_all_filters) = self.seg.segment(text=text, 
                                                                                                     lower=lower, 
                                                                                                     speech_tag_filter=speech_tag_filter)
        
        if vertex_source == 'no_filter':
            vertex_source = self.words_no_filter
        elif vertex_source == 'no_stop_words':
            vertex_source = self.words_no_stop_words
        else:
            vertex_source = self.words_all_filters

        if edge_source == 'no_filter':
            edge_source = self.words_no_filter
        elif edge_source == 'all_filters':
            edge_source = self.words_all_filters
        else:
            edge_source = self.words_no_stop_words
            
        
        
        index = 0
        for words in vertex_source:
            for word in words:
                if not self.word_index.has_key(word):
                    self.word_index[word] = index
                    self.index_word[index] = word
                    index += 1
        
        words_number = index  # number of distinct words
        self.graph = np.zeros((words_number, words_number))
        
        for word_list in edge_source:
            for w1, w2 in self.combine(word_list, window):
                if not self.word_index.has_key(w1):
                    continue
                if not self.word_index.has_key(w2):
                    continue
                index1 = self.word_index[w1]
                index2 = self.word_index[w2]
                self.graph[index1][index2] = 1.0
                self.graph[index2][index1] = 1.0
        
#         for x in xrange(words_number):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum
        
        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph) # this is a dict
        sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
        for index, _ in sorted_scores:
            self.keywords.append(self.index_word[index])
            
        
 
    
    def combine(self, word_list, window = 2):
        '''
        Build the word pairs within the given window, used to construct edges between words. Implemented as a generator.
        word_list: a list of words.
        window: window size.
        '''
        window = int(window)
        if window < 2: window = 2
        for x in xrange(1, window):
            if x >= len(word_list):
                break
            word_list2 = word_list[x:]
            res = zip(word_list, word_list2)
            for r in res:
                yield r
    
    def get_keywords(self, num = 6, word_min_len = 1):
        '''
        Get the num most important keywords whose length is at least word_min_len.
        Returns a list of keywords.
        '''
        result = []
        count = 0
        for word in self.keywords:
            if count >= num:
                break
            if len(word) >= word_min_len:
                result.append(word)
                count += 1
        return result
    
    def get_keyphrases(self, keywords_num = 12, min_occur_num = 2): 
        '''
        Get key phrases.
        Build candidate phrases from the top keywords_num keywords; a phrase must occur at least min_occur_num times in the original text.
        Returns a list of key phrases.
        '''
        keywords_set = set(self.get_keywords(num=keywords_num, word_min_len = 1))
            
        keyphrases = set()
        one = []
        for sentence_list in self.words_no_filter:
            for word in sentence_list:
                # print '/'.join(one)
                # print word
                if word in keywords_set:
                    one.append(word)
                else:
                    if len(one)>1:
                        keyphrases.add(''.join(one))
                        one = []
                        continue
                    one = []
                    
        return [phrase for phrase in keyphrases 
                if self.text.count(phrase) >= min_occur_num]
Example 33
class TextRank4Keyword(object):
    
    def __init__(self, stop_words_file = None, 
                 allow_speech_tags = util.allow_speech_tags, 
                 delimiters = util.sentence_delimiters):
        """
        Keyword arguments:
        stop_words_file  --  str,指定停止词文件路径(一行一个停止词),若为其他类型,则使用默认停止词文件
        delimiters       --  默认值是`?!;?!。;…\n`,用来将文本拆分为句子。
        
        Object Var:
        self.words_no_filter      --  对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words  --  去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters    --  保留words_no_stop_words中指定词性的单词而得到的两级列表。
        """
        self.text = ''
        self.keywords = None
        
        self.seg = Segmentation(stop_words_file=stop_words_file, 
                                allow_speech_tags=allow_speech_tags, 
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None     # 2-D list
        self.words_no_stop_words = None
        self.words_all_filters = None
        
    def analyze(self, text, 
                window = 2, 
                lower = False,
                vertex_source = 'all_filters',
                edge_source = 'no_stop_words',
                pagerank_config = {'alpha': 0.85,}):
        """分析文本

        Keyword arguments:
        text       --  文本内容,字符串。
        window     --  窗口大小,int,用来构造单词之间的边。默认值为2。
        lower      --  是否将文本转换为小写。默认为False。
        vertex_source   --  选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点。
                            默认值为`'all_filters'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。关键词也来自`vertex_source`。
        edge_source     --  选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来构造pagerank对应的图中的节点之间的边。
                            默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。边的构造要结合`window`参数。
        """
        
        # self.text = util.as_text(text)
        self.text = text
        self.word_index = {}
        self.index_word = {}
        self.keywords = []
        self.graph = None
        
        result = self.seg.segment(text=text, lower=lower)
        self.sentences = result.sentences
        self.words_no_filter = result.words_no_filter
        self.words_no_stop_words = result.words_no_stop_words
        self.words_all_filters   = result.words_all_filters

        options = ['no_filter', 'no_stop_words', 'all_filters']

        if vertex_source in options:
            _vertex_source = result['words_'+vertex_source]
        else:
            _vertex_source = result['words_all_filters']

        if edge_source in options:
            _edge_source   = result['words_'+edge_source]
        else:
            _edge_source   = result['words_no_stop_words']

        self.keywords = util.sort_words(_vertex_source, _edge_source, window = window, pagerank_config = pagerank_config)

    def get_keywords(self, num = 6, word_min_len = 1):
        """获取最重要的num个长度大于等于word_min_len的关键词。

        Return:
        关键词列表。
        """
        result = []
        count = 0
        for item in self.keywords:
            if count >= num:
                break
            if len(item.word) >= word_min_len:
                result.append(item)
                count += 1
        return result
    
    def get_keyphrases(self, keywords_num = 12, min_occur_num = 2): 
        """获取关键短语。
        获取 keywords_num 个关键词构造的可能出现的短语,要求这个短语在原文本中至少出现的次数为min_occur_num。

        Return:
        关键短语的列表。
        """
        keywords_set = set([ item.word for item in self.get_keywords(num=keywords_num, word_min_len = 1)])
        keyphrases = set()
        for sentence in self.words_no_filter:
            one = []
            for word in sentence:
                if word in keywords_set:
                    one.append(word)
                else:
                    if len(one) >  1:
                        keyphrases.add(''.join(one))
                    if len(one) == 0:
                        continue
                    else:
                        one = []
            # flush the trailing phrase, if any
            if len(one) >  1:
                keyphrases.add(''.join(one))

        return [phrase for phrase in keyphrases 
                if self.text.count(phrase) >= min_occur_num]
Example 34
class KeywordExtraction(object):
    """ 关键词提取 """

    def __init__(self, stop_words_file=None, delimiters="?!;?!。;…\n"):
        """
		stop_words_file: 默认为None,若设置为文件路径,将由该文件构造停止词过滤器
		delimiters: 分隔符集合
		"""
        super(KeywordExtraction, self).__init__()
        """变量说明
		self.keywords: 关键词列表
        self.words_no_filter:对text进行分词而得到的两级列表(每行为一个句子的分词结果列表)。
        self.words_no_stop_words:对text进行分词,同时去掉停止词而得到的两级列表。
        self.words_all_filters:对text进行分词,同时去停止词,保留指定词性的单词而得到的两级列表。
        self.word_index: 字典(单词-下标)
        self.index_word: 字典(下标-单词)
        self.graph: 由单词构成的图,用于pagerank算法
		"""
        self.text = ""
        self.keywords = []
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)

        self.words_no_filter = None
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.word_index = {}
        self.index_word = {}
        self.graph = None

    def combine(self, word_list, window=2):
        """函数功能:构造在window窗口长度下的单词组合,用来构造self.graph中单词之间的边。
        word_list: 由单词组成的列表。
        windows:窗口大小。
		"""
        window = int(window)
        if window < 2:
            window = 2
        for x in range(1, window):
            if x >= len(word_list):
                break
            word_list2 = word_list[x:]
            # zip() pairs each word with the word x positions ahead
            result = zip(word_list, word_list2)
            # yield the pairs lazily from a generator
            for res in result:
                yield res
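        # Illustration (not in the original source): for
        #   word_list = ['a', 'b', 'c', 'd'] and window = 3
        # combine() yields
        #   ('a', 'b'), ('b', 'c'), ('c', 'd')   # offset 1
        #   ('a', 'c'), ('b', 'd')               # offset 2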

    def train(
        self,
        text,
        window=2,
        lower=False,
        speech_tag_filter=True,
        vertex_source="all_filters",
        edge_source="no_stop_words",
    ):
        """
		函数功能:构建self.graph所需的节点和边,使用pagerank算法进行训练,返回按得分降序的词列表
        text: 待处理文本,字符串
        window:窗口大小,用来构造单词之间的边。默认值为2
        lower:是否将文本转换为小写。默认为False。
        speech_tag_filter:若为True,则使用默认的self.default_speech_tag_filter过滤,
						若为list,则使用speech_tag_filter过滤
						否则不过滤
        vertex_source:(节点源)选择哪个分词结果来构造pagerank对应的图中的节点
                        默认值为'all_filters',可选值为'no_filter', 'no_stop_words', 'all_filters'
                        关键词也来自vertex_source
        edge_source:(边源)选择使用哪个分词结果来构造pagerank对应的图中的节点之间的边
                        默认值为'no_stop_words',可选值为'no_filter', 'no_stop_words', 'all_filters'边的构造要结合window参数
		"""
        self.text = text
        (_, self.words_no_filter, self.words_no_stop_words, self.words_all_filters) = self.seg.segment_text(
            text=text, lower=lower, speech_tag_filter=speech_tag_filter
        )

        if vertex_source == "no_filter":
            vertex_source = self.words_no_filter
        elif vertex_source == "no_stop_words":
            vertex_source = self.words_no_stop_words
        else:
            vertex_source = self.words_all_filters

        if edge_source == "no_filter":
            edge_source = self.words_no_filter
        elif edge_source == "no_stop_words":
            edge_source = self.words_no_stop_words
        else:
            edge_source = self.words_all_filters

        # build graph nodes: give every distinct word an index
        index = 0
        for words in vertex_source:
            for word in words:
                if word not in self.word_index:
                    self.word_index[word] = index
                    self.index_word[index] = word
                    index += 1

        # build the graph as a words_number x words_number adjacency matrix
        words_number = index
        self.graph = np.zeros((words_number, words_number))  # matrix

        # build edges from word co-occurrence within the window
        for word_list in edge_source:
            for w1, w2 in self.combine(word_list, window):
                if w1 not in self.word_index:
                    continue
                if w2 not in self.word_index:
                    continue
                index1 = self.word_index[w1]
                index2 = self.word_index[w2]
                self.graph[index1][index2] = 1.0
                self.graph[index2][index1] = 1.0

        # run PageRank on the word graph via networkx
        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph)

        # sort words by PageRank score, highest first
        sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        for index, _ in sorted_scores:
            self.keywords.append(self.index_word[index])

    def get_keywords(self, num=6, word_min_len=1):
        """函数功能:获取关键词列表
		num: 关键词个数
		word_min_len: 关键词最小长度
		"""
        result = []
        count = 0
        for word in self.keywords:
            if count >= num:
                break
            if len(word) >= word_min_len:
                result.append(word)
                count += 1
        return result

    def get_keyphrases(self, keywords_num=12, min_occur_num=2):
        """函数功能:获取关键短语列表
        keywords_num: 关键词个数
        min_occur_num: 关键短语在原文中至少出现的次数
        """
        keywords_set = set(self.get_keywords(num=keywords_num, word_min_len=1))
        keyphrases = set()
        for sentence_list in self.words_no_filter:
            one_word = []
            for word in sentence_list:
                if word in keywords_set:
                    one_word.append(word)
                else:
                    if len(one_word) > 1:
                        keyphrases.add("".join(one_word))
                    one_word = []
            # flush a trailing phrase at the end of the sentence
            if len(one_word) > 1:
                keyphrases.add("".join(one_word))
        result = [phrase for phrase in keyphrases if self.text.count(phrase) >= min_occur_num]
        return result
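A minimal usage sketch for `KeywordExtraction` (a sketch only: it assumes this module's `Segmentation`, `numpy as np` and `networkx as nx` imports are available, as the class requires):

text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
ke = KeywordExtraction(stop_words_file=None)
ke.train(text=text, window=2, vertex_source="all_filters", edge_source="no_stop_words")
print(ke.get_keywords(num=10, word_min_len=2))
print(ke.get_keyphrases(keywords_num=12, min_occur_num=2))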
Esempio n. 35
from miscFuncs import cleanFilesOfPrevSess


info="Single-test repeated experiment"#any info about the test
#Setting train-validation-test split ratios, 
# if 'test' exists as a static folder, the first two values are used 
# and rest of the data is split taking into account their ratio
splitRatios=[0.65,0.15,0.20]

#This flag defines if the same number of samples will be used for each class in training
#   If True, data augmentation(via up-sampling some existing files) will be carried
#   to have balanced set for training and validation. Not applicable to test files
useBalancedData=True

#Define segmentation strategy
async2secSegments = Segmentation("None", periodSync=False, sizeType="fixed", frameSizeMs=2000.0, hopSizeMs=1000.0)
async3secSegments = Segmentation("None", periodSync=False, sizeType="fixed", frameSizeMs=3000.0, hopSizeMs=1000.0)
segStrategies=[async2secSegments,async3secSegments]

#Define features to be used
features=[]
for featName in ['SubEnv']:#other options: 'MFCC','MelSpec'
    for segType in segStrategies:
        for timeDim in [32,64]:
            for freqDim in [16]:
                features.append(Feature(featName,[timeDim,freqDim],"frame",segType,involveDelta=False))
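
# Sanity check (added for illustration): the grid above is
# 1 feature name x 2 segmentation strategies x 2 time dimensions x 1 frequency dimension
assert len(features) == 4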


#Define data specifications for this database
data = Data(dbaName, dataFolder, featureFolder, features, useBalancedData, splitRatios, info)
#Defining NN model with a name. 
Esempio n. 36
class TextRank4Sentence(object):
    def __init__(self, stop_words_file=None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`:默认值为None,此时内部停止词表为空;可以设置为文件路径(字符串),将从停止词文件中提取停止词。
        `delimiters`:默认值是`'?!;?!。;…\n'`,用来将文本拆分为句子。
        
        self.sentences:由句子组成的列表。
        self.words_no_filter:对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words:去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters:保留words_no_stop_words中指定词性的单词而得到的两级列表。
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2维列表
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.graph = None
        self.key_sentences = None

    def train(self,
              text,
              lower=False,
              speech_tag_filter=True,
              source='no_stop_words',
              sim_func='standard'):
        ''' 
       `text`:文本内容,字符串。
        `lower`:是否将文本转换为小写。默认为False。
        `speech_tag_filter`:若值为True,将调用内部的词性列表来过滤生成words_all_filters。
                        若值为False,words_all_filters与words_no_stop_words相同。
        `source`:选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来生成句子之间的相似度。
                默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。
        `sim_func`: 指定计算句子相似度的函数。当前只有一个函数,对应默认值`standard`。
        '''

        self.key_sentences = []

        (self.sentences, self.words_no_filter, self.words_no_stop_words,
         self.words_all_filters) = self.seg.segment(
             text=text, lower=lower, speech_tag_filter=speech_tag_filter)
        # -

        # print self.sentences

        if source == 'no_filter':
            source = self.words_no_filter
        elif source == 'all_filters':
            source = self.words_all_filters
        else:
            source = self.words_no_stop_words

        sim_func = self._get_similarity_standard

        sentences_num = len(source)

        self.graph = np.zeros((sentences_num, sentences_num))

        for x in range(sentences_num):
            for y in range(x, sentences_num):
                similarity = sim_func(source[x], source[y])
                self.graph[x, y] = similarity
                self.graph[y, x] = similarity

#         for x in xrange(sentences_num):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum

# print self.graph

        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph)  # this is a dict
        sorted_scores = sorted(scores.items(),
                               key=lambda item: item[1],
                               reverse=True)
        # sorted_scores: (sentence index, PageRank score) pairs, sorted by score in descending order

        for index, _ in sorted_scores:
            self.key_sentences.append(self.sentences[index])

        # print '\n'.join(self.key_sentences)

    def _get_similarity_standard(self, word_list1, word_list2):
        ''' 
        默认的用于计算两个句子相似度的函数。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        vector1, vector2 = self._gen_vectors(word_list1, word_list2)

        # print vector1, vector2

        vector3 = [vector1[x] * vector2[x] for x in range(len(vector1))]
        vector4 = [1 for num in vector3 if num > 0.]
        co_occur_num = sum(vector4)

        # print co_occur_num

        if co_occur_num == 0.:
            return 0.

        denominator = math.log(float(len(word_list1))) + math.log(
            float(len(word_list2)))  # denominator of the similarity measure

        if denominator == 0.:
            return 0.

        return co_occur_num / denominator
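        # Worked example (illustrative only): two sentences of 3 and 4 words
        # sharing 2 words give
        #   co_occur_num = 2
        #   denominator  = log(3) + log(4) ≈ 1.0986 + 1.3863 = 2.4849
        #   similarity   ≈ 2 / 2.4849 ≈ 0.80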

    def _gen_vectors(self, word_list1, word_list2):
        '''
        两个句子转换成两个同样大小向量。可以通过这两个向量来计算两个句子的相似度。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        words = list(set(word_list1 + word_list2))
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]
        return vector1, vector2

    def get_key_sentences(self, num=6, sentence_min_len=6):
        '''
        获取最重要的num个长度大于等于sentence_min_len的句子用来生成摘要。
        返回列表。
        '''
        result = []
        count = 0
        for sentence in self.key_sentences:
            if count >= num:
                break
            if len(sentence) >= sentence_min_len:
                result.append(sentence)
                count += 1
        return result


# if __name__ == '__main__':
#
#     import codecs
#     # text = codecs.open('../text/03.txt', 'r', 'utf-8').read()
#     text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
#     tr4s = TextRank4Sentence(stop_words_file='../stopword.data')
#     tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters')
#     print('\n'.join(tr4s.get_key_sentences(num=1)))
#
#     print('\n'.join(tr4s.sentences))
#     for wl in tr4s.words_no_filter:
#         print('[', ', \''.join(wl), ']')
#     print()
#     for wl in tr4s.words_no_stop_words:
#         print('[', ', \''.join(wl), ']')
#     print()
#     for wl in tr4s.words_all_filters:
#         print('[', ', \''.join(wl), ']')
class TextRank4Sentence(object):
    
    def __init__(self, stop_words_file = None, 
                 allow_speech_tags = util.allow_speech_tags,
                 delimiters = util.sentence_delimiters):
        """
        Keyword arguments:
        stop_words_file  --  str,停止词文件路径,若不是str则是使用默认停止词文件
        delimiters       --  默认值是`?!;?!。;…\n`,用来将文本拆分为句子。
        
        Object Var:
        self.sentences               --  由句子组成的列表。
        self.words_no_filter         --  对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words     --  去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters       --  保留words_no_stop_words中指定词性的单词而得到的两级列表。
        """
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                allow_speech_tags=allow_speech_tags,
                                delimiters=delimiters)
        
        self.sentences = None
        self.words_no_filter = None     # 2维列表
        self.words_no_stop_words = None
        self.words_all_filters = None
        
        self.key_sentences = None
        
    def analyze(self, text, lower = False, 
              source = 'no_stop_words', 
              sim_func = util.get_similarity,
              pagerank_config = {'alpha': 0.85,}):
        """
        Keyword arguments:
        text                 --  文本内容,字符串。
        lower                --  是否将文本转换为小写。默认为False。
        source               --  选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来生成句子之间的相似度。
                                 默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。
        sim_func             --  指定计算句子相似度的函数。
        """
        
        self.key_sentences = []
        
        result = self.seg.segment(text=text, lower=lower)
        self.sentences = result.sentences
        self.words_no_filter = result.words_no_filter
        self.words_no_stop_words = result.words_no_stop_words
        self.words_all_filters   = result.words_all_filters

        options = ['no_filter', 'no_stop_words', 'all_filters']
        if source in options:
            _source = result['words_'+source]
        else:
            _source = result['words_no_stop_words']

        self.key_sentences = util.sort_sentences(sentences = self.sentences,
                                                 words     = _source,
                                                 sim_func  = sim_func,
                                                 pagerank_config = pagerank_config)

            
    def get_key_sentences(self, num = 6, sentence_min_len = 6):
        """获取最重要的num个长度大于等于sentence_min_len的句子用来生成摘要。

        Return:
        多个句子组成的列表。
        """
        result = []
        count = 0
        for item in self.key_sentences:
            if count >= num:
                break
            if len(item['sentence']) >= sentence_min_len:
                result.append(item)
                count += 1
        return result
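A minimal usage sketch for this class (a sketch only: it assumes the module's `util` and `Segmentation` helpers are importable; `item['sentence']` is the access pattern used in `get_key_sentences` above):

tr4s = TextRank4Sentence()
tr4s.analyze(text="这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。",
             lower=True, source='all_filters')
for item in tr4s.get_key_sentences(num=2):
    print(item['sentence'])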
Esempio n. 38
class TextRank4Sentence(object):
    def __init__(self, stop_words_file=None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`:默认值为None,此时内部停止词表为空;可以设置为文件路径(字符串),将从停止词文件中提取停止词。
        `delimiters`:默认值是`'?!;?!。;…\n'`,用来将文本拆分为句子。
        
        self.sentences:由句子组成的列表。
        self.words_no_filter:对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words:去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters:保留words_no_stop_words中指定词性的单词而得到的两级列表。
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file,
                                delimiters=delimiters)

        self.sentences = None
        self.words_no_filter = None  # 2维列表
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.graph = None
        self.key_sentences = None

    def train(self,
              text,
              lower=False,
              speech_tag_filter=True,
              source='no_stop_words',
              sim_func='standard'):
        ''' 
       `text`:文本内容,字符串。
        `lower`:是否将文本转换为小写。默认为False。
        `speech_tag_filter`:若值为True,将调用内部的词性列表来过滤生成words_all_filters。
                        若值为False,words_all_filters与words_no_stop_words相同。
        `source`:选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来生成句子之间的相似度。
                默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。
        `sim_func`: 指定计算句子相似度的函数。当前只有一个函数,对应默认值`standard`。
        '''

        self.key_sentences = []

        (self.sentences, self.words_no_filter, self.words_no_stop_words,
         self.words_all_filters) = self.seg.segment(
             text=text, lower=lower, speech_tag_filter=speech_tag_filter)
        # -

        # print self.sentences

        if source == 'no_filter':
            source = self.words_no_filter
        elif source == 'all_filters':
            source = self.words_all_filters
        else:
            source = self.words_no_stop_words

        sim_func = self._get_similarity_standard

        sentences_num = len(source)

        self.graph = np.zeros((sentences_num, sentences_num))

        for x in range(sentences_num):
            for y in range(x, sentences_num):
                similarity = sim_func(source[x], source[y])
                self.graph[x, y] = similarity
                self.graph[y, x] = similarity


#         for x in xrange(sentences_num):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum

# print self.graph

        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph)  # this is a dict
        sorted_scores = sorted(scores.items(),
                               key=lambda item: item[1],
                               reverse=True)

        # print sorted_scores

        for index, _ in sorted_scores:
            self.key_sentences.append(self.sentences[index])

        # print '\n'.join(self.key_sentences)

    def _get_similarity_standard(self, word_list1, word_list2):
        ''' 
        默认的用于计算两个句子相似度的函数。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        vector1, vector2 = self._gen_vectors(word_list1, word_list2)

        # print vector1, vector2

        vector3 = [vector1[x] * vector2[x] for x in range(len(vector1))]
        vector4 = [1 for num in vector3 if num > 0.]
        co_occur_num = sum(vector4)

        # print co_occur_num

        if co_occur_num == 0.:
            return 0.

        denominator = math.log(float(len(word_list1))) + math.log(
            float(len(word_list2)))  # denominator of the similarity measure

        if denominator == 0.:
            return 0.

        return co_occur_num / denominator

    def _gen_vectors(self, word_list1, word_list2):
        '''
        两个句子转换成两个同样大小向量。可以通过这两个向量来计算两个句子的相似度。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        words = list(set(word_list1 + word_list2))
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]
        return vector1, vector2
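        # Illustration (not in the original source): for
        #   word_list1 = ['酒店', '北京', '雕塑'], word_list2 = ['酒店', '文艺', '气息']
        # the combined vocabulary has 5 distinct words, so (up to set() ordering)
        #   vector1 -> [1.0, 1.0, 1.0, 0.0, 0.0]
        #   vector2 -> [1.0, 0.0, 0.0, 1.0, 1.0]
        # and only the shared word '酒店' contributes to co_occur_num above.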

    def get_key_sentences(self, num=6, sentence_min_len=6):
        '''
        获取最重要的num个长度大于等于sentence_min_len的句子用来生成摘要。
        返回列表。
        '''
        result = []
        count = 0
        for sentence in self.key_sentences:
            if count >= num:
                break
            if len(sentence) >= sentence_min_len:
                result.append(sentence)
                count += 1
        return result
Esempio n. 39
class TextRank4Sentence(object):
    
    def __init__(self, stop_words_file = None, delimiters='?!;?!。;…\n'):
        '''
        `stop_words_file`:默认值为None,此时内部停止词表为空;可以设置为文件路径(字符串),将从停止词文件中提取停止词。
        `delimiters`:默认值是`'?!;?!。;…\n'`,用来将文本拆分为句子。
        
        self.sentences:由句子组成的列表。
        self.words_no_filter:对sentences中每个句子分词而得到的两级列表。
        self.words_no_stop_words:去掉words_no_filter中的停止词而得到的两级列表。
        self.words_all_filters:保留words_no_stop_words中指定词性的单词而得到的两级列表。
        '''
        self.seg = Segmentation(stop_words_file=stop_words_file, delimiters=delimiters)
        
        self.sentences = None
        self.words_no_filter = None     # 2维列表
        self.words_no_stop_words = None
        self.words_all_filters = None
        
        self.graph = None
        self.key_sentences = None
        self.key_weight = None
        self.type = 1

    def Clear(self):
        self.sentences = None
        self.words_no_filter = None     # 2维列表
        self.words_no_stop_words = None
        self.words_all_filters = None
        
        self.graph = None
        self.key_sentences = None
        self.key_weight = None

    def GetSentenceSim(self, title1, title2):
        sim_func = self._get_similarity_standard
        (sentences1, words_no_filter1, words_no_stop_words1, words_all_filters1) = self.seg.segment(text=title1,lower=False, speech_tag_filter=True)
        (sentences2, words_no_filter2, words_no_stop_words2, words_all_filters2) = self.seg.segment(text=title2,lower=False,speech_tag_filter=True)
        source1 = words_no_filter1
        source2 = words_no_filter2
        similarity = sim_func(source1[0], source2[0])
        #print '1 '+title1+'\n2 '+title2+'\n sim '+bytes(similarity)
        return similarity

    def train_weight(self,doc):
        self.type = 1
        self.key_sentences = []
        self.key_weight = []
        
        (self.sentences,self.words_all_filters,weight) = self.seg.segment_sentences_weight(text=doc)
        #print doc['title']
        (title) = self.seg.segment_sentence(sentence=doc['title'])
        #print title
        source = self.words_all_filters
        sim_func = self._get_similarity_standard
        
        sentences_num = len(source)
        
        self.graph = np.zeros((sentences_num, sentences_num))

        # Personalization prior: each sentence's own weight multiplied by its
        # similarity to the title, normalised below by the total.
        weights = []
        summary = 0
        for x in range(sentences_num):
            title_sim = sim_func(source[x], title[0])
            w = weight[x] * title_sim
            weights.append(x)
            weights.append(w)
            summary += w
        if summary != 0:
            dicts = {weights[i]: weights[i + 1] / summary for i in range(0, len(weights), 2)}

        for x in range(sentences_num):
            for y in range(x, sentences_num):
                similarity = sim_func(source[x], source[y])
                self.graph[x, y] = similarity
                self.graph[y, x] = similarity
#         for x in xrange(sentences_num):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum
                
        nx_graph = nx.from_numpy_matrix(self.graph)
        if summary != 0:
            scores = nx.pagerank(G=nx_graph, personalization=dicts)
        else:
            scores = nx.pagerank(G=nx_graph)
        sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        
        # print sorted_scores
        
        for index, _ in sorted_scores:
            self.key_sentences.append(self.sentences[index])
            self.key_weight.append(weight[index])
            
        # print '\n'.join(self.key_sentences)
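        # Note (added for clarity): nx.pagerank's `personalization` argument biases
        # the random walk towards the given nodes, e.g.
        #   nx.pagerank(G=nx_graph, personalization={0: 0.5, 1: 0.3, 2: 0.2})
        # Here each sentence's prior is weight[x] * similarity(sentence_x, title),
        # normalised by `summary`, so sentences close to the title rank higher.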

    def train(self, text, lower = False, speech_tag_filter=True,
              source = 'no_stop_words', sim_func = 'standard'):
        ''' 
       `text`:文本内容,字符串。
        `lower`:是否将文本转换为小写。默认为False。
        `speech_tag_filter`:若值为True,将调用内部的词性列表来过滤生成words_all_filters。
                        若值为False,words_all_filters与words_no_stop_words相同。
        `source`:选择使用words_no_filter, words_no_stop_words, words_all_filters中的哪一个来生成句子之间的相似度。
                默认值为`'no_stop_words'`,可选值为`'no_filter', 'no_stop_words', 'all_filters'`。
        `sim_func`: 指定计算句子相似度的函数。当前只有一个函数,对应默认值`standard`。
        '''
        self.type = 2
        self.key_sentences = []
        self.key_weight = []
        
        (self.sentences, self.words_no_filter, self.words_no_stop_words, self.words_all_filters) = self.seg.segment(text=text,  
                                                                                                                    lower=lower, 
                                                                                                                    speech_tag_filter=speech_tag_filter)
        # -
        
        # print self.sentences   
                                                                                                          
        if source == 'no_filter':
            source = self.words_no_filter
        elif source == 'all_filters':
            source = self.words_all_filters
        else:
            source = self.words_no_stop_words
            
        sim_func = self._get_similarity_standard
        
        sentences_num = len(source)
        
        self.graph = np.zeros((sentences_num, sentences_num))
        
        for x in range(sentences_num):
            for y in range(x, sentences_num):
                similarity = sim_func(source[x], source[y])
                self.graph[x, y] = similarity
                self.graph[y, x] = similarity
                
#         for x in xrange(sentences_num):
#             row_sum = np.sum(self.graph[x, :])
#             if row_sum > 0:
#                 self.graph[x, :] = self.graph[x, :] / row_sum
                
        # print self.graph
                
        nx_graph = nx.from_numpy_matrix(self.graph)
        scores = nx.pagerank(nx_graph) # this is a dict
        sorted_scores = sorted(scores.items(), key = lambda item: item[1], reverse=True)
        
        # print sorted_scores
        
        for index, _ in sorted_scores:
            self.key_sentences.append(self.sentences[index])

    def _get_similarity_standard(self, word_list1, word_list2):
        ''' 
        默认的用于计算两个句子相似度的函数。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        vector1, vector2 = self._gen_vectors(word_list1, word_list2)

        # print vector1, vector2

        vector3 = [vector1[x] * vector2[x] for x in range(len(vector1))]
        vector4 = [1 for num in vector3 if num > 0.]
        co_occur_num = sum(vector4)
        
        # print co_occur_num
        
        if co_occur_num == 0.:
            return 0.
        
        denominator = math.log(float(len(word_list1))) + math.log(float(len(word_list2)))  # denominator of the similarity measure
        
        if denominator == 0.:
            return 0.
        
        return co_occur_num / denominator
        
        
    def _gen_vectors(self, word_list1, word_list2):
        '''
        两个句子转换成两个同样大小向量。可以通过这两个向量来计算两个句子的相似度。
        word_list1, word_list2: 分别代表两个句子,都是由单词组成的列表
        '''
        words = list(set(word_list1 + word_list2))        
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]
        return vector1, vector2
            
    def get_key_sentences(self, num = 6, sentence_min_len = 6):
        '''
        获取最重要的num个长度大于等于sentence_min_len的句子用来生成摘要。
        返回(句子列表, 权重列表);权重列表仅在调用train_weight后(self.type == 1)非空。
        '''
        result = []
        result2 = []
        counts = 0
        for count in range(len(self.key_sentences)):
            sentence = self.key_sentences[count]
            if self.type == 1:
                weight = self.key_weight[count]
            if counts >= num:
                break
            if len(sentence) >= sentence_min_len:
                result.append(sentence)
                if self.type == 1:
                    result2.append(weight)
                counts += 1
        return result, result2
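A rough usage sketch for this variant (hedged: the exact `doc` structure expected by `seg.segment_sentences_weight` is internal to this project's `Segmentation`; only the `doc['title']` key is visible in `train_weight` above):

# tr = TextRank4Sentence(stop_words_file='stopword.data')
# tr.train_weight(doc)  # doc must carry the body text plus doc['title']
# sentences, weights = tr.get_key_sentences(num=3, sentence_min_len=6)
# for s, w in zip(sentences, weights):
#     print(w, s)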
Esempio n. 40
    def i_classifier(self): 
        """
        Interface function to launch decision tree classification with a input segmentation :func:`Segmentation.Segmentation`.
        
        This function store optimal threshold by class **Segmentation.out_threshold**. Then compute zonal statistics by polygons
        for every images in multi-processing (if **mp** = 1).
        """ 
        
        # Multiprocessing
        mgr = BaseManager()
        mgr.register('defaultdict', defaultdict, DictProxy)
        mgr.start()
        multi_process_var = [] # Multi processing variable
          
        # Extract final cartography
        out_carto = Segmentation(self.path_segm, self.path_area) 
        out_carto.output_file = self.output_name_moba
        out_carto.out_class_name = self.in_class_name
        out_carto.out_threshold = []
        for ind_th in range(len(self.sample_name)):
            out_carto.out_threshold.append(self.decis[ind_th].threshold[0])
            if '>' in self.decis[ind_th].threshold[0]:
                out_carto.out_threshold.append(self.decis[ind_th].threshold[0].replace('>', '<='))
            elif '<' in self.decis[ind_th].threshold[0]:
                out_carto.out_threshold.append(self.decis[ind_th].threshold[0].replace('<', '>='))
        #     out_carto.zonal_stats((raster_path[ind_th], list_band_outraster[ind_th]))
            multi_process_var.append([self.raster_path[ind_th], self.list_band_outraster[ind_th]])
         
        # Compute zonal stats on slope raster
        multi_process_var.append([self.raster_path[ind_th+1], self.list_band_outraster[ind_th+1]])
        out_carto.out_threshold.append('<'+str(self.slope_degree)) # To agriculture
        out_carto.out_threshold.append('>='+str(self.slope_degree)) # To scree
        if self.path_mnt != '':
            # Add class indexes
            self.tree_direction[0].append(6)
            self.tree_direction[0].append(7)
            
        # Compute zonal stats on Max NDVI raster  
        try:  
            # out_carto.zonal_stats((raster_path[ind_th+1], list_band_outraster[ind_th+1]))
            multi_process_var.append([self.raster_path[ind_th+2], self.list_band_outraster[ind_th+2]])
            # Compute stats twice, because there are 3 classes, not 2
            # out_carto.zonal_stats((raster_path[ind_th+1], list_band_outraster[ind_th+1]))
            multi_process_var.append([self.raster_path[ind_th+2], self.list_band_outraster[ind_th+2]])
        except Exception:
            print('No MNT (DEM) available for the 3rd step')
            multi_process_var.append([self.raster_path[ind_th+1], self.list_band_outraster[ind_th+1]])
            multi_process_var.append([self.raster_path[ind_th+1], self.list_band_outraster[ind_th+1]])

        # Compute zonal stats with multi processing
        out_carto.stats_dict = mgr.defaultdict(list)
        p = []
        kwargs = {}
        for i in range(len(multi_process_var)):
            kwargs['rank'] = i
            kwargs['nb_img'] = len(multi_process_var)
            p.append(Process(target=out_carto.zonal_stats, args=(multi_process_var[i], ), kwargs=kwargs))
            p[i].start()
            
            if self.mp == 0:
                p[i].join()
        
        if self.mp == 1:       
            for i in range(len(multi_process_var)):
                p[i].join()

        # If more than one fieldname line edit is filled in the classification tab
        if len(self.sample_name) > 2:
            # Compute the biomass and density distribution
            out_carto.compute_biomass_density()
            
        out_carto.class_tab_final = defaultdict(list)
        self.i_tree_direction()
        out_carto.decision_tree(self.tree_direction)
        
        # If more than one fieldname line edit is filled in the classification tab
        if len(self.sample_name) > 2:     
            # Compute biomass and density scale
            out_carto.append_scale(self.in_class_name[2], 'self.stats_dict[ind_stats][3]/self.max_bio')
            out_carto.append_scale(self.in_class_name[3], 'self.stats_dict[ind_stats][2]/self.max_wood_idm')
          
        # Final cartography
        out_carto.create_cartography(self.out_fieldname_carto, self.out_fieldtype_carto)
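
For reference, a minimal, self-contained sketch of the fan-out/join pattern used above (assumptions: `zonal_stats_stub` stands in for `out_carto.zonal_stats`, and the raster names are placeholders):

from multiprocessing import Process

def zonal_stats_stub(args, rank=0, nb_img=1):
    # stand-in for out_carto.zonal_stats: just report which raster/band it would process
    print("worker {}/{} processing {}".format(rank + 1, nb_img, args))

if __name__ == '__main__':
    multi_process_var = [('raster_a.tif', 1), ('raster_b.tif', 2)]
    mp = 1  # 1 = run workers in parallel, 0 = run them one after another
    p = []
    for i, var in enumerate(multi_process_var):
        proc = Process(target=zonal_stats_stub, args=(var,),
                       kwargs={'rank': i, 'nb_img': len(multi_process_var)})
        p.append(proc)
        proc.start()
        if mp == 0:
            proc.join()  # sequential mode: wait before starting the next worker
    if mp == 1:
        for proc in p:
            proc.join()  # parallel mode: wait for all workers at the end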