Beispiel #1
0
def genSegmentTermList(termList):
    """Normalize, split and filter raw terms into segmentation terms.

    Pipeline: strip useless whitespace, promote bracketed content to
    standalone terms, trim invalid edge characters, keep only the first
    comma-separated segment, then drop punctuation-bearing / all-digit /
    time-temperature terms and stopwords. Returns the deduplicated
    result sorted by length. Prints intermediate sizes for monitoring.
    """
    # Strip useless whitespace, then dedupe.
    termList = uniqueList([removeUselessSpace(t) for t in termList])

    # Promote bracketed content to standalone terms.
    expanded = []
    for t in tqdm(termList):
        outer, inner = removeBracket(t)
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    print('size={}'.format(len(termList)))

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]

    # Split on full/half-width commas; keep only the first segment.
    firstSegments = [re.split('[,,]', t, maxsplit=1)[0] for t in tqdm(termList)]
    termList = uniqueList(firstSegments)
    print('size={}'.format(len(termList)))

    termList = [t for t in termList if not containPunc(t)]  # drop terms with punctuation
    termList = [t for t in termList if not allDigit(t)]  # drop pure digits
    # Drop terms containing time / temperature words.
    termList = [t for t in termList if not containUselessDigitTerm(t)]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Beispiel #2
0
def genSegmentTermList(termList):
    """Normalize, split and filter raw terms into segmentation terms.

    Pipeline: strip useless whitespace, trim invalid edge characters,
    drop descriptive sentences, split on commas, filter out
    punctuation-bearing / all-digit / time-temperature terms, keep only
    terms of length 2..19 that contain Chinese, drop "见…" ("see …")
    cross-references, then remove stopwords, dedupe, and return the
    result sorted by length.
    """
    termList = uniqueList([removeUselessSpace(term)
                           for term in termList])  # strip useless whitespace

    termList = [removeBeginEnd(term) for term in tqdm(termList)]  # trim invalid edge chars
    termList = [term for term in termList
                if not isDescription(term)]  # drop descriptive sentences

    temp = []  # split terms on punctuation
    for term in tqdm(termList):
        subList = [subword.strip() for subword in re.split('[,,]', term)]
        temp.extend(subList)
    termList = uniqueList(temp)
    print('size={}'.format(len(termList)))

    termList = [term for term in termList if not containPunc(term)]  # drop terms with punctuation
    termList = [term for term in termList if not allDigit(term)]  # drop pure digits
    termList = [
        term for term in termList if not containUselessDigitTerm(term)
    ]  # drop terms containing time / temperature words

    termList = [
        term for term in termList
        if len(term) > 1 and len(term) < 20 and containCNS(term)
    ]  # keep: length > 1, length < 20, and contains Chinese
    # Drop cross-reference entries like "见XXX" ("see XXX").
    # FIX: raw string — '\w' in a plain literal is an invalid escape
    # (DeprecationWarning, SyntaxWarning since Python 3.12); pattern
    # text itself is unchanged.
    termList = [term for term in termList if re.match(r'^见[\w]+', term) is None]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Beispiel #3
0
 def baikeScript():
     """Merge the UMLS Wikipedia and Baidu-Baike term files into one.

     Reads 'umls_wikipedia.txt' and 'umls_baidubaike.txt' from `folder`,
     dedupes and length-sorts the union, prints the size, and writes the
     result to 'umls_baike.txt'.
     """
     # FIX: use context managers — the original left all three file
     # handles open (open(...).read() / print(file=open(...))).
     with open(folder + os.sep + 'umls_wikipedia.txt') as fWiki, \
             open(folder + os.sep + 'umls_baidubaike.txt') as fBaike:
         termList = uniqueList(fWiki.read().splitlines() +
                               fBaike.read().splitlines())
     termList = sortedByLength(termList)
     print('size={}'.format(len(termList)))
     with open(folder + os.sep + 'umls_baike.txt', 'w') as fOut:
         print('\n'.join(termList), file=fOut)
Beispiel #4
0
def genSegmentTermList(termList):
    """Build a cleaned, all-Chinese segmentation term list from raw terms.

    Steps: whitespace cleanup, coarse filter (length > 1 and contains
    Chinese), bracket extraction, edge trimming, punctuation split,
    digit-based filters, a second edge trim, final filter (1 < length
    < 20 and entirely Chinese), stopword removal, dedupe, and a length
    sort. Prints intermediate sizes for monitoring.
    """
    # Strip useless whitespace, then dedupe.
    termList = uniqueList([removeUselessSpace(t) for t in termList])
    print('size={}'.format(len(termList)))
    # Coarse filter: longer than one char and contains Chinese.
    termList = [t for t in tqdm(termList) if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))

    # Promote bracketed content to standalone terms.
    expanded = []
    for t in tqdm(termList):
        outer, inner = removeBracket(t)
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    print('size={}'.format(len(termList)))

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split each term on the listed separators and strip every piece.
    pieces = []
    for t in tqdm(termList):
        pieces.extend(p.strip() for p in re.split('[,;;]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    termList = [t for t in tqdm(termList) if not allDigit(t)]  # drop pure digits
    # Drop terms containing time / temperature words.
    termList = [t for t in tqdm(termList) if not containUselessDigitTerm(t)]

    # Trim edges again after splitting.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))
    # Final filter: 1 < length < 20 and entirely Chinese.
    termList = [
        t for t in tqdm(termList)
        if len(t) > 1 and len(t) < 20 and allCNS(t)
    ]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)  # shortest first
Beispiel #5
0
def genSegmentTermList(termList):
    """Clean a raw term list into short, all-Chinese segmentation terms.

    Strips whitespace, trims edge characters, drops all-digit and
    time/temperature terms, keeps only entirely-Chinese terms of length
    2..19, removes stopwords, dedupes, and returns the result sorted by
    length.
    """
    # Strip useless whitespace, then dedupe.
    termList = uniqueList([removeUselessSpace(t) for t in termList])

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    termList = [t for t in termList if not allDigit(t)]  # drop pure digits
    # Drop terms containing time / temperature words.
    termList = [t for t in termList if not containUselessDigitTerm(t)]
    # Final filter: 1 < length < 20 and entirely Chinese.
    termList = [t for t in termList if 1 < len(t) < 20 and allCNS(t)]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Beispiel #6
0
def getICIBATermList():
    """Collect iCIBA (Kingsoft) translations from the graded-MT pickle.

    Returns:
        list: deduplicated iCIBA translation strings found in the
        'source' dict of each graded-MT entry.
    """
    termList = []
    # FIX: context manager — the original pickle.load(open(...)) never
    # closed the file handle.
    with open(DATA_PATH + os.sep + 'umlsMT' + os.sep + 'GradedMT.pkl',
              'rb') as f:
        gradedMT = pickle.load(f)
    for AUI, infoDict in tqdm(gradedMT.items()):
        if 'ICIBA' in infoDict['source']:
            termList.append(infoDict['source']['ICIBA'])
    termList = uniqueList(termList)
    return termList
Beispiel #7
0
def genSegmentTermList(termList):
    """Turn raw terms into cleaned segmentation entries sorted by length.

    Strips whitespace, trims edge characters, splits on commas, drops
    punctuation-bearing / all-digit / time-temperature terms and
    non-Chinese single characters, removes stopwords, dedupes, and
    returns the result sorted by length.
    """
    # Strip useless whitespace, then dedupe.
    termList = uniqueList([removeUselessSpace(t) for t in termList])

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]

    # Split on full/half-width commas and strip each piece.
    pieces = []
    for t in tqdm(termList):
        pieces.extend(p.strip() for p in re.split('[,,]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    termList = [t for t in termList if not containPunc(t)]  # drop terms with punctuation
    termList = [t for t in termList if not allDigit(t)]  # drop pure digits
    # Drop terms containing time / temperature words.
    termList = [t for t in termList if not containUselessDigitTerm(t)]
    # Drop single characters that are not Chinese.
    termList = [t for t in termList if not (len(t) == 1 and not isCNS(t))]

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Beispiel #8
0
 def script2():
     """Keep bgequal terms present in Baidu-Baike or Wikipedia and save.

     Reads 'umls_bgequal.txt', filters it through baiduBaikeFilter
     (against 'umls_baidubaike.txt') and wikiPediaFilter, then writes
     the deduplicated, length-sorted union to 'umls_bgequal_baike.txt'.
     """
     from segment.umls import baiduBaikeFilter, wikiPediaFilter
     # FIX: use context managers — the original left all three file
     # handles open (open(...).read() / print(file=open(...))).
     with open(folder + os.sep + 'umls_bgequal.txt') as fIn:
         termList = fIn.read().splitlines()
     with open(folder + os.sep + 'umls_baidubaike.txt') as fBaike:
         baikeLines = fBaike.read().splitlines()
     baiduTermList = baiduBaikeFilter(termList, baikeLines)
     wikiTermList = wikiPediaFilter(termList)
     termList = sortedByLength(uniqueList(baiduTermList + wikiTermList))
     print('size: {}'.format(len(termList)))
     with open(folder + os.sep + 'umls_bgequal_baike.txt', 'w') as fOut:
         print('\n'.join(termList), file=fOut)
Beispiel #9
0
def genSnomedctSegment(termList):
    """Produce SNOMED-CT segmentation terms from raw translated terms.

    Strips whitespace, promotes bracketed content to standalone terms,
    trims edge characters, splits on commas/colons, drops
    punctuation-bearing / all-digit / time-temperature terms, keeps
    terms longer than one char that contain Chinese, removes stopwords,
    dedupes, and returns the result sorted by length. Prints
    intermediate sizes for monitoring.
    """
    # Strip useless whitespace, then dedupe.
    termList = uniqueList([removeUselessSpace(t) for t in termList])
    # Promote bracketed content to standalone terms.
    expanded = []
    for t in tqdm(termList):
        outer, inner = removeBracket(t)
        expanded.append(outer)
        expanded.extend(inner)
    termList = uniqueList(expanded)
    print('size={}'.format(len(termList)))

    # Trim invalid leading/trailing characters.
    termList = [removeBeginEnd(t) for t in tqdm(termList)]
    print('size={}'.format(len(termList)))

    # Split on full/half-width commas and colons; strip every piece.
    pieces = []
    for t in tqdm(termList):
        pieces.extend(p.strip() for p in re.split('[,,::]', t))
    termList = uniqueList(pieces)
    print('size={}'.format(len(termList)))

    termList = [t for t in termList if not containPunc(t)]  # drop terms with punctuation
    termList = [t for t in termList if not allDigit(t)]  # drop pure digits
    # Drop terms containing time / temperature words.
    termList = [t for t in termList if not containUselessDigitTerm(t)]
    print('size={}'.format(len(termList)))

    # Keep terms longer than one char that contain Chinese.
    termList = [t for t in termList if len(t) > 1 and containCNS(t)]
    print('size={}'.format(len(termList)))

    termList = removeStopwords(termList)  # remove stopwords
    termList = uniqueList(termList)  # dedupe
    print('size={}'.format(len(termList)))
    return sortedByLength(termList)
Beispiel #10
0
def genBGUnorderEqual():
    """Collect terms whose Baidu and Google translations match unordered.

    Scans the graded-MT pickle; for entries with confidence <= 5,
    tokenizes both translations with StandardAnalyzer and, when the two
    token sets are equal (same words, any order), keeps both strings.

    Returns:
        list: deduplicated matching translation strings.
    """
    # FIX: context manager — the original pickle.load(open(...)) never
    # closed the file handle.
    with open(DATA_PATH + os.sep + 'umlsMT' + os.sep + 'GradedMT.pkl',
              'rb') as f:
        gradedMT = pickle.load(f)
    termList = []
    analyzer = StandardAnalyzer()
    for AUI, infoDict in tqdm(gradedMT.items()):
        if infoDict['confidence'] > 5:
            continue  # skip high-confidence entries
        baidu = infoDict['source'].get('baidu', '')
        google = infoDict['source'].get('google', '')
        # Unordered equality: identical token sets from both MT outputs.
        if set(analyzer.split(baidu)) == set(analyzer.split(google)):
            termList.append(baidu)
            termList.append(google)
    termList = uniqueList(termList)
    return termList