Example #1
    def get_annotated_comments(self):
        """Annotate each tokenised comment with token/POLARITY tags."""
        dct = {}
        for comment_key, comments_val in self.comments.items():
            tokens_comments = [TokenHandler().word_tokenizer(comment[1])
                               for comment in comments_val]
            dct.setdefault(comment_key, [])
            for tokens_comment in tokens_comments:
                result = ''
                for i, token in enumerate(tokens_comment):
                    if token in self.kms:
                        # Known token: tag it straight from the lexicon.
                        result += "{}/{} ".format(token, self.kms[token])
                    elif self.normalised:
                        # Unknown token: try to normalise it using its left
                        # context ("<s>" marks the start of the comment).
                        prev_token = tokens_comment[i - 1] if i > 0 else "<s>"
                        normalised_token = EDBNormaliser(
                            (prev_token, token), self.kms, self.cpd_pickled
                        ).normalise_token()
                        if normalised_token is None:
                            result += "{}/{} ".format(token, 'UNK')
                        else:
                            result += self.get_token_polarity(normalised_token)
                    else:
                        # Normalisation disabled: tag unknown tokens as UNK.
                        result += "{}/{} ".format(token, 'UNK')
                dct[comment_key].append(result)
                print(result)
        return self.comments, dct
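These examples come from several unrelated projects, and a few of them (Examples #1, #10 and #11) call TokenHandler().word_tokenizer(...) on raw text. The real tokenizer is project-specific; a minimal regex-based sketch of the assumed interface, purely illustrative, is:

import re

class TokenHandler:
    def word_tokenizer(self, text):
        # Hypothetical sketch: split the text into word tokens.
        # The project's actual tokenizer may treat punctuation,
        # case and unicode differently.
        return re.findall(r"\w+", text)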
Example #2
def test_handleStartTagTokenHandlesFirstStartTag():
    testToken = StartTagToken("html")
    th = TokenHandler("TestRootUrl/")
    assert len(th.elementTreeRoot.children) == 0
    th.handleStartTagToken(testToken)
    assert len(th.elementTreeRoot.children) == 1
    assert th.elementTreeRoot.children[0].name == "html"
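The assertions above imply that handleStartTagToken appends a new child element under elementTreeRoot. A minimal method-style sketch of that behaviour, with a hypothetical ElementNode type, might be:

class ElementNode:
    def __init__(self, name):
        self.name = name
        self.children = []

def handleStartTagToken(self, token):
    # Hypothetical sketch: the first start tag becomes a child of the
    # root. The real handler also tracks nesting; see Examples #6 and #9.
    self.elementTreeRoot.children.append(ElementNode(token.name))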
Example #3
    def parseHTML(self, htmlString):
        """
            parseHTML tokenises the raw htmlString, builds the element
            tree, and refills the render list from it.
        """
        tsm = TokenizerStateMachine()
        tokenHandler = TokenHandler()
        tokenHandler.rootUrl = self.rootUrl
        i = 0
        while i < len(htmlString):
            # handleCharacter returns how many characters were consumed.
            i += tsm.handleCharacter(htmlString[i])
            if tsm.currentEmittedToken is not None:
                if (isinstance(tsm.currentEmittedToken, StartTagToken)
                        and tsm.currentEmittedToken.name == "link"):
                    # link tags get dedicated handling.
                    self.handleLinkToken(tsm.currentEmittedToken)
                else:
                    tokenHandler.processToken(tsm.currentEmittedToken)
                tsm.currentEmittedToken = None

        # Reset render state before filling it from the new element tree.
        self.strList.clear()
        self.renderList.clear()
        self.renderObjects.clear()
        self.fillRenderList(tokenHandler.elementTreeRoot, self.renderList)
        for s in self.renderList:
            print(s)
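The driver loop above relies on handleCharacter returning the number of characters consumed and setting currentEmittedToken once a token completes. A toy state machine honouring that contract, purely for illustration, could look like:

class ToyStateMachine:
    def __init__(self):
        self.currentEmittedToken = None
        self._buffer = ""

    def handleCharacter(self, ch):
        # Consume exactly one character; "emit" a token at whitespace.
        if ch.isspace() and self._buffer:
            self.currentEmittedToken = self._buffer
            self._buffer = ""
        elif not ch.isspace():
            self._buffer += ch
        return 1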
Example #4
    def convertTokenListToHTMLElementTree(self, url, tokenList):
        # The root url is needed by the TokenHandler for resolving
        # possible relative links.
        rootUrl = self.extractRootUrl(url)
        print(rootUrl)
        tokenHandler = TokenHandler(rootUrl)
        for token in tokenList:
            tokenHandler.processToken(token)
        return tokenHandler.elementTreeRoot
Example #5
def test_getAbsoluteUrl():
    rootUrl = "TestRootUrl/"
    th = TokenHandler(rootUrl)
    testUrls = [
        "http://google.com",
        "https://i.ytimg.com/vi/nrIDL7h9MFQ/hqdefault.jpg?sqp=-oaymwEYCNIBEHZIVfKriqkDCwgBFQAAiEIYAXAB&amp;rs=AOn4CLByrwt1ptJWI5zGkLOZhJpyrFeCSw",
        "ARelativeUrl", "/RelativeWithSlash"
    ]
    assert th.getAbsoluteUrl(testUrls[0]) == "http://google.com"
    assert th.getAbsoluteUrl(
        testUrls[1]
    ) == "https://i.ytimg.com/vi/nrIDL7h9MFQ/hqdefault.jpg?sqp=-oaymwEYCNIBEHZIVfKriqkDCwgBFQAAiEIYAXAB&amp;rs=AOn4CLByrwt1ptJWI5zGkLOZhJpyrFeCSw"
    assert th.getAbsoluteUrl(testUrls[2]) == rootUrl + "ARelativeUrl"
    assert th.getAbsoluteUrl(testUrls[3]) == rootUrl + "RelativeWithSlash"
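A minimal getAbsoluteUrl consistent with these assertions (hypothetical; the project's real method may normalise urls further) would be:

def getAbsoluteUrl(self, url):
    if not self.isRelativeUrl(url):
        return url
    # Join relative urls onto the root, avoiding a doubled slash.
    return self.rootUrl + url.lstrip("/")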
Example #6
def test_processTokenHandlesClosingTags():
    testTokens = [
        StartTagToken("p"),
        EndTagToken("p"),
        StartTagToken("a"),
        EndTagToken("a")
    ]

    th = TokenHandler("TestRootUrl/")
    for token in testTokens:
        th.processToken(token)

    assert th.elementTreeRoot.name == "#root#"
    assert len(th.elementTreeRoot.children) == 2
    assert th.elementTreeRoot.children[0].name == "p"
    assert th.elementTreeRoot.children[1].name == "a"
Example #7
    def __init__(self) -> None:
        '''
        Initializes the program and sets the class variables that are used
        as the initial values across the program.

        Required command line arguments:
            GitHub URL (https://github.com/{Username}/{Repository})

        Optional command line arguments:
            GitHub Personal Access Token
        '''
        self.args = sys.argv[1:]  # All command line args, excluding the filename
        self.githubURL = None
        self.githubUser = None
        self.githubRepo = None
        self.githubToken = None
        self.githubTokenList = None  # Pulled from keys.txt
        self.dbCursor = None  # Database-specific variable
        self.dbConnection = None  # Database-specific variable
        self.th = TokenHandler()  # Writes and reads tokens to/from tokens.txt
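Given that docstring, the command line contract can be mirrored by a small validation snippet (the URL check shown here is an assumption, not the project's code):

import sys

args = sys.argv[1:]
# First argument: required GitHub URL; second: optional access token.
if not args or not args[0].startswith("https://github.com/"):
    sys.exit("Usage: <GitHub URL> [GitHub Personal Access Token]")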
Example #8
def test_processTokenProcessesIndependentTags():
    testTokens = [
        StartTagToken("br"),
        StartTagToken("img"),
        StartTagToken("html"),
        EndTagToken("html")
    ]
    testTokens[1].isSelfClosing = True

    th = TokenHandler("TestRootUrl/")
    for token in testTokens:
        th.processToken(token)

    assert len(th.elementTreeRoot.children) == 3
    assert th.elementTreeRoot.children[0].name == "br"
    assert th.elementTreeRoot.children[1].name == "img"
    assert th.elementTreeRoot.children[2].name == "html"
Example #9
def test_processTokenHandlesUnacceptableTags():
    testTokens = [
        StartTagToken("p"),
        StartTagToken("NoHTMLTagShouldEverHaveThisName"),
        StartTagToken("body"),
        StartTagToken("a"),
        EndTagToken("a"),
        EndTagToken("body"),
        EndTagToken("NoHTMLTagShouldEverHaveThisName"),
        EndTagToken("p")
    ]

    th = TokenHandler("TestRootUrl/")
    for token in testTokens:
        th.processToken(token)

    assert len(th.elementTreeRoot.children) == 1
    assert th.elementTreeRoot.children[0].name == "p"
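Taken together, Examples #6, #8 and #9 suggest processToken keeps a stack of open elements, treats self-closing and void tags such as br and img as leaf children, and silently drops unrecognised tag names. A compact sketch of such logic (hypothetical: the ACCEPTED_TAGS whitelist, the ElementNode type from the note under Example #2, and an openElements stack assumed to start as [self.elementTreeRoot]) is:

ACCEPTED_TAGS = {"html", "body", "p", "a", "br", "img"}  # assumed whitelist

def processToken(self, token):
    if token.name not in ACCEPTED_TAGS:
        return  # unacceptable tags are ignored entirely
    if isinstance(token, StartTagToken):
        node = ElementNode(token.name)
        self.openElements[-1].children.append(node)
        # Void and self-closing tags never become the open element.
        if not (token.isSelfClosing or token.name in ("br", "img")):
            self.openElements.append(node)
    elif isinstance(token, EndTagToken):
        if self.openElements[-1].name == token.name:
            self.openElements.pop()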
Example #10
    def get_political_news_comment(self):
        conn = self.create_connection()
        with conn:
            cur = conn.cursor()
            sql = "select comment_id, comment from t_comments where comment_id " \
                  "in (select news_id from t_news where news_category = nc2 " \
                  "and news_category = 1)"
            cur.execute(sql)
            data = cur.fetchall()
            cur.close()

            comments = {}
            for ctr_cmmnt, content in enumerate(data):
                comment_id = content[0].strip()
                comment = ' '.join(TokenHandler().word_tokenizer(
                    content[1].strip().lower()))
                # Group the (counter, tokenised comment) pairs by comment_id.
                comments.setdefault(comment_id, []).append((ctr_cmmnt, comment))
            return comments
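The returned mapping groups the running (counter, tokenised comment) pairs by comment_id; its shape is roughly the following (values hypothetical):

comments = {
    "news_01": [(0, "first tokenised comment"), (2, "another comment")],
    "news_02": [(1, "a comment on a different news item")],
}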
Example #11
    resulting_file = input('File name as an output: ')

    # Read configuration (key = value pairs)
    configs = {}
    with open('System.conf', 'r') as f:
        for line in f:
            cfg = line.strip().split('=')
            configs[cfg[0].strip()] = cfg[1].strip()

    # Get political news comments
    comments = NewsHandler(configs['dbnews']).get_political_news_comment()

    # Build the bigram model first to ease the next computation
    with open('political_comment_corpus.txt', 'r') as f:
        corpus = TokenHandler().word_tokenizer(f.readline())
    lm = bigrams(corpus)

    # Lidstone smoothing (an MLE-only variant is possible by swapping
    # the estimator for MLEProbDist)
    cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm))
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    cpd = ConditionalProbDist(pickle.loads(cfd_pickled), lidstone_estimator)

    # Annotate each comment
    before_annotated, after_annotated = PolarityAnnotator(
        comments, configs['dbkbbi_cleaned_offline'], cpd, normalised=isNormalised
    ).get_annotated_comments()
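Once built, the smoothed model can be queried through nltk's ConditionalProbDist interface; a small sanity check with hypothetical tokens:

# P(next | prev) under the Lidstone-smoothed bigram model.
prev, nxt = "berita", "politik"  # hypothetical tokens
print(cpd[prev].prob(nxt))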
Example #12
            f.write(grammar.grammar.start_symbol + "\n")
            f.write(view)

        directive_handler.code_generator.print_code()

    @staticmethod
    def empty_files(file_out, file_error):
        # Opening in write mode truncates each file to empty.
        with open(file_out, mode="w"):
            pass
        with open(file_error, mode="w"):
            pass


DEFAULT_FILE_IN_NAME = "scanner.txt"
DEFAULT_FILE_OUT_NAME = "parsetree.txt"
DEFAULT_FILE_ERROR_NAME = "error.txt"

c_lexical_dfa = CLexicalDFA.make_c_lexical_dfa()
not_printing_tokens = [CTokenType.WHITE_SPACE, CTokenType.COMMENT]
c_token_handler = TokenHandler(c_lexical_dfa, not_printing_tokens)
grammar = LL1Grammar(Grammar.make_grammar(compressed_grammar))
parser = Parser(grammar)
parse_handler = ParserHandler(parser)
compiler = Compiler(c_token_handler, parse_handler)
compiler.compile(DEFAULT_FILE_IN_NAME, DEFAULT_FILE_OUT_NAME, DEFAULT_FILE_ERROR_NAME)
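For orientation, a sketch of the Compiler driver these objects plug into; the class body and its tokenize/parse calls are assumptions, since only the wiring appears above:

class Compiler:
    def __init__(self, token_handler, parse_handler):
        self.token_handler = token_handler
        self.parse_handler = parse_handler

    def compile(self, file_in, file_out, file_error):
        # Scan the C source into tokens, then LL(1)-parse the stream,
        # writing the parse tree and any errors to the given files.
        tokens = self.token_handler.tokenize(file_in)  # assumed API
        self.parse_handler.parse(tokens, file_out, file_error)  # assumed API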
Example #13
def test_isRelativeUrl():
    th = TokenHandler("TestRootUrl/")
    testUrls = ["http://google.com", "ARelativeUrl", "/RelativeWithSlash"]
    assert not th.isRelativeUrl(testUrls[0])
    assert th.isRelativeUrl(testUrls[1])
    assert th.isRelativeUrl(testUrls[2])
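A predicate consistent with these assertions (hypothetical; a real implementation might recognise other absolute schemes too) is:

def isRelativeUrl(self, url):
    return not (url.startswith("http://") or url.startswith("https://"))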