Example 1: OrganizeLex
def OrganizeLex(lexiconLocation, _CommentDict, _LexiconDict):
    with open(lexiconLocation, encoding='utf-8') as dictionary:
        oldWord = "firstCommentLine"
        for line in dictionary:
            if line.startswith("//"):
                # Accumulate comment lines under the most recently seen word.
                if _CommentDict.get(oldWord):
                    _CommentDict[oldWord] += line
                else:
                    _CommentDict[oldWord] = line
                continue
            code, comment = utils.SeparateComment(line)
            blocks = [x.strip() for x in code.split(":") if x]
            if len(blocks) != 2:
                continue  # expect exactly "word : features"
            newNode = False
            node = SearchLexicon(blocks[0], 'origin')
            if not node:
                newNode = True
                node = LexiconNode(blocks[0])
                if "_" in node.text:
                    node.forLookup = True  # underscore-joined combination words are lookup-only
                if comment:
                    node.comment = comment
            features, node = SplitFeaturesWithSemicolon(blocks[1], node)
            for feature in features:
                if re.match(r"^'.*'$", feature):
                    node.norm = feature.strip("'")
                elif re.match(r"^/.*/$", feature):
                    node.atom = feature.strip("/")
                elif re.search(r"[\u4e00-\u9fff]", feature):
                    # Tokens containing CJK ideographs become the normalized form.
                    node.norm = feature
                else:
                    featureID = GetFeatureID(feature)
                    if featureID == -1:
                        logging.info("Missing feature: " + feature)
                        if not feature.startswith("\\"):
                            node.missingfeature += "\\" + feature
                        else:
                            node.missingfeature = feature
                    node.features.add(featureID)
                    # Inherit all ancestors of this feature from the ontology.
                    ontologynode = SearchFeatureOntology(featureID)
                    if ontologynode and ontologynode.ancestors:
                        node.features.update(ontologynode.ancestors)

            if newNode:
                _LexiconDict[node.text] = node
            oldWord = blocks[0]

    logging.info("Finish loading lexicon" + lexiconLocation)
Example 2: LoadLexiconFilterlist
def LoadLexiconFilterlist(BlacklistLocation):
    if BlacklistLocation.startswith("."):
        BlacklistLocation = os.path.join(os.path.dirname(os.path.realpath(__file__)),  BlacklistLocation)
    with open(BlacklistLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if word:
                _LexiconFilterSet.add(word)
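
The startswith(".") branch anchors relative paths at the script's own directory rather than the current working directory; the idiom in isolation (resolve_relative is a hypothetical name):

import os

def resolve_relative(location):
    # A leading "." means: resolve against this file's directory, so the
    # loader behaves the same no matter where the process was started.
    if location.startswith("."):
        location = os.path.join(os.path.dirname(os.path.realpath(__file__)), location)
    return location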
Example 3: LoadAppendixList
def LoadAppendixList(featureOncologyLocation):
    Folder = os.path.dirname(featureOncologyLocation)
    NoShowFileLocation = os.path.join(Folder, "featureNotShow.txt")
    with open(NoShowFileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if not word:
                continue
            NotShowList.append(GetFeatureID(word))

    NoCopyFileLocation = os.path.join(Folder, "featureNotCopy.Parser.txt")
    with open(NoCopyFileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if not word:
                continue
            NotCopyList.append(GetFeatureID(word))
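
The two loops above differ only in file name and target list; a hedged refactor sketch (LoadFeatureIDList is a hypothetical name, and GetFeatureID and utils.SeparateComment are assumed from the surrounding module):

def LoadFeatureIDList(FileLocation, target):
    # Shared loader: one feature name per line, comments stripped.
    with open(FileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            word, _ = utils.SeparateComment(line)
            if word:
                target.append(GetFeatureID(word))

LoadAppendixList could then call it once per file:

    LoadFeatureIDList(NoShowFileLocation, NotShowList)
    LoadFeatureIDList(NoCopyFileLocation, NotCopyList)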
Example 4: LoadTopCharacters
def LoadTopCharacters(FileLocation):
    Top500 = ""
    with open(FileLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            characters, _ = utils.SeparateComment(line)
            if not characters:
                continue
            Top500 += characters
    return Top500[:100], Top500  # note: the first value holds only 100 characters despite the name
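
A hedged usage sketch; "topchars.txt" is a stand-in file name, and the first return value is the 100-character prefix of the second:

top100, all_top = LoadTopCharacters("topchars.txt")
assert all_top.startswith(top100) and len(top100) <= 100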
Example 5: AlignMain
def AlignMain():
    newloc = "outputMain.txt"
    with open(newloc, 'w', encoding='utf-8') as file:
        with open(paraMain, encoding='utf-8') as dictionary:
            for line in dictionary:
                if line.startswith("//"):
                    file.write(line)
                    continue
                code, comment = utils.SeparateComment(line)
                lexicons = (_LexiconDictB, _LexiconDictP, _LexiconDictL, _LexiconDictI,
                            _LexiconDictI4, _LexiconDictLexX, _LexiconDictDefX)
                # Keep only entries that appear in none of the lexicons.
                if all(code not in lexicon for lexicon in lexicons):
                    file.write(code + " " + comment + "\n")
    shutil.move(newloc, paraMain)
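
A runnable illustration of the membership test used above: all() over a tuple of dictionaries is True exactly when the key is absent from every one of them:

dicts = ({"a": 1}, {"b": 2}, {"c": 3})
print(all("d" not in d for d in dicts))  # True: "d" is in none of them
print(all("b" not in d for d in dicts))  # False: "b" is in the second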
Example 6: LoadFeatureSet
def LoadFeatureSet(featureOncologyLocation):
    global _FeatureList, _FeatureDict, _FeatureSet
    _FeatureSet.clear()

    with open(featureOncologyLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            code, _ = utils.SeparateComment(line)
            features = [x.strip() for x in re.split(r"[,;=\s]", code) if x]

            for feature in features:
                # Skip quoted normalized forms and /atom/ tokens.
                if re.match(r"^'.*'$", feature) or re.match(r"^/.*/$", feature):
                    continue
                _FeatureSet.add(feature)
    _FeatureList = sorted(_FeatureSet)  # sorted() already returns a list
    _FeatureDict = {f: ID for ID, f in enumerate(_FeatureList)}
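
A runnable illustration of the tokenization above: the character class splits on commas, semicolons, equals signs, and whitespace, and the "if x" filter drops the empty pieces left between adjacent delimiters:

import re

code = "noun, animate; gender=fem  plural"
features = [x.strip() for x in re.split(r"[,;=\s]", code) if x]
print(features)  # ['noun', 'animate', 'gender', 'fem', 'plural']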
Example 7: LoadLexiconBlacklist
def LoadLexiconBlacklist(BlacklistLocation):
    if BlacklistLocation.startswith("."):
        BlacklistLocation = os.path.join(os.path.dirname(os.path.realpath(__file__)),  BlacklistLocation)
    with open(BlacklistLocation, encoding="utf-8") as dictionary:
        for line in dictionary:
            content, _ = utils.SeparateComment(line)
            if not content:
                continue
            if " " in content or "\t" in content:
                # "word<sep>frequency": split on the first space or tab.
                spaceindex = content.find(" ")
                if spaceindex < 0:
                    spaceindex = content.find("\t")
                _word = content[:spaceindex] + "$"
                _freq = int(content[spaceindex + 1:])
            else:
                # Bare entry without a frequency: use the whole word and the default.
                _word = content + "$"
                _freq = Freq_Basic_Blacklist
            _Blacklist_Freq[_word] = _freq
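
A self-contained sketch of the entry format parsed above, "word frequency" pairs separated by a space or tab, with Freq_Basic_Blacklist (a module-level constant in the original) as the fallback; parse_blacklist_entry and default_freq=100 are stand-ins:

def parse_blacklist_entry(content, default_freq=100):
    # "word 5000" -> ("word$", 5000); a bare word gets the default frequency.
    sep = content.find(" ")
    if sep < 0:
        sep = content.find("\t")
    if sep >= 0:
        return content[:sep] + "$", int(content[sep + 1:])
    return content + "$", default_freq

print(parse_blacklist_entry("的 5000"))  # ('的$', 5000)
print(parse_blacklist_entry("了"))       # ('了$', 100)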
Example 8: SetAncestors
    def SetAncestors(self, line):
        code, comment = utils.SeparateComment(line)
        self.Comment = comment
        code = self.ProcessAliasInFeatureFile(code)
        if len(code) == 0:
            return

        features = [x.strip() for x in re.split("[,; ]", code) if x]
        if not features:
            return  # nothing but separators on this line
        openWord = features[0]
        openWordID = GetFeatureID(openWord)

        TryOldNode = SearchFeatureOntology(openWordID)
        if TryOldNode:
            # Existing ontology node: merge in any newly listed ancestors.
            for feature in features[1:]:
                TryOldNode.ancestors.add(GetFeatureID(feature))
        else:
            # New node: record the word and its ancestors on self.
            self.openWord = openWord
            self.openWordID = openWordID
            for feature in features[1:]:
                self.ancestors.add(GetFeatureID(feature))
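
SetAncestors follows a merge-or-create pattern: new parents are merged into an existing ontology node, otherwise the current object is initialized. The same pattern on plain data, runnable on its own (add_ancestors is a hypothetical name):

ontology = {}

def add_ancestors(word, parents):
    # Merge into the existing entry when present, else create one.
    node = ontology.get(word)
    if node is not None:
        node.update(parents)
    else:
        ontology[word] = set(parents)

add_ancestors("dog", {"animal"})
add_ancestors("dog", {"pet"})
print(ontology)  # {'dog': {'animal', 'pet'}}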
Example 9: unit test driver (fragment)
    level = logging.INFO

    # Remove existing handlers so basicConfig can take effect.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(level=level,
                        format='%(asctime)s [%(levelname)s] %(message)s')

    UnitTest = {}
    if not os.path.exists(args.inputfile):
        print("Unit Test file " + args.inputfile + " does not exist.")
        exit(1)  # exit non-zero: a missing input file is an error

    with open(args.inputfile, encoding="utf-8") as RuleFile:
        for line in RuleFile:
            if line.strip():
                Content, _ = utils.SeparateComment(line.strip())
                if Content and '\t' in Content:  # test files hold "sentence<TAB>score" lines with no rule name
                    TestSentence, Sales = Content.split('\t', 1)  # maxsplit=1 always yields exactly two fields
                    UnitTest[TestSentence] = int(float(Sales))

    for Sentence in UnitTest:
        LexicalAnalyzeURL = utils.ParserConfig.get(
            "main",
            "url_larestfulservice") + "/LexicalAnalyze?Type=json&Sentence="
        ret = requests.get(LexicalAnalyzeURL + "\"" + Sentence + "\"")  # note: Sentence is appended without URL-encoding
        root = jsonpickle.decode(ret.text)
        for s in root['sons']:  # ignore the root
            AccumulateNodes(s)

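A quick check of the split('\t', 1) fix above: with at least one tab present (guaranteed by the '\t' in Content guard), maxsplit=1 always yields exactly two fields, so the two-name unpacking cannot raise:

Content = "这是测试\t3.0"
TestSentence, Sales = Content.split('\t', 1)
print(TestSentence, int(float(Sales)))  # 这是测试 3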
Example 10: OutputFeatureOntologyGraph
def OutputFeatureOntologyGraph():
    # Parse the feature file once and memoize the results on the function object.
    if not hasattr(OutputFeatureOntologyGraph, "graph"):
        from collections import defaultdict
        OutputFeatureOntologyGraph.outbound = defaultdict(int)
        OutputFeatureOntologyGraph.inbound = defaultdict(int)
        OutputFeatureOntologyGraph.nodeset = set()

        OutputFeatureOntologyGraph.graph = set()
        PipeLineLocation = utils.ParserConfig.get("main", "Pipelinefile")
        XLocation = os.path.dirname(PipeLineLocation)
        with open(os.path.join(XLocation, '..', 'Y', 'feature.txt'),
                  encoding="utf-8") as dictionary:
            for line in dictionary:
                code, comment = utils.SeparateComment(line)
                if "," not in code:
                    continue  # no comma means no edge to record

                OpenWord, ancestors = code.split(",", 1)
                OpenWordID = GetFeatureID(OpenWord.split("=", 1)[0].strip())  # strip the alias part
                if OpenWordID == -1:
                    logging.warning("OutputFeatureOntologyGraph: wrong word ID for line {}.".format(code))
                    continue
                for path in ancestors.split(";"):
                    prev = OpenWordID
                    for node in path.split(","):
                        if not node.strip():
                            continue
                        parentid = GetFeatureID(node.strip())
                        if parentid == -1:
                            logging.warning("OutputFeatureOntologyGraph: wrong parentid for node {}".format(node))
                            continue
                        if (prev, parentid) not in OutputFeatureOntologyGraph.graph:
                            OutputFeatureOntologyGraph.graph.add((prev, parentid))
                            OutputFeatureOntologyGraph.outbound[prev] += 1
                            OutputFeatureOntologyGraph.inbound[parentid] += 1
                            OutputFeatureOntologyGraph.nodeset.add(prev)
                            OutputFeatureOntologyGraph.nodeset.add(parentid)
                        prev = parentid

    output = "{\n"
    for node in sorted(OutputFeatureOntologyGraph.nodeset):
        output += "{} [label=\"{}\" tooltip=\"Inbound:{} Outbound:{} \" ];\n".format(
            node, GetFeatureName(node),
            OutputFeatureOntologyGraph.inbound[node],
            OutputFeatureOntologyGraph.outbound[node])
    for edge in sorted(OutputFeatureOntologyGraph.graph):
        # Integer tuples sort lexicographically, matching itemgetter(0, 1).
        output += "\t{}->{} ;\n".format(edge[0], edge[1])
    output += "}\n"

    logging.info(
        "Feature ontology graph: {} edges over {} nodes.".format(
            len(OutputFeatureOntologyGraph.graph),
            len(OutputFeatureOntologyGraph.nodeset)))
    return output
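
The hasattr guard at the top caches the parsed graph on the function object itself, so the feature file is read only on the first call; the idiom in isolation (expensive is a hypothetical name):

def expensive():
    # Compute once, then reuse the value stored as a function attribute.
    if not hasattr(expensive, "cache"):
        print("computing...")
        expensive.cache = sum(range(10))
    return expensive.cache

print(expensive())  # computing... then 45
print(expensive())  # 45, no recomputation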