Exemple #1
0
class Summarizer:
    """Collect per-sentence keyword data for a catchphrase and its text."""

    def __init__(self):
        self.parser = Parser()

    def summarize(self, text, title):
        """Return the parser's keyword data for ``title`` and each sentence.

        :param text: iterable of sentences; every item is handed to
            ``self.parser.getKeywords`` on its own.
        :param title: the catchphrase, handed to ``getKeywords`` as one unit.
        :return: list of ``(keywords, word_count)`` tuples as produced by the
            parser. The first entry belongs to ``title``; the remaining
            entries follow the order of ``text``.
        """
        # Step 1: keyword data of the catchphrase.
        keywords, word_count = self.parser.getKeywords(title)
        result = [(keywords, word_count)]

        # Step 2: keyword data of every sentence, in input order.
        for sentence in text:
            keywords, word_count = self.parser.getKeywords(sentence)
            result.append((keywords, word_count))

        return result
Exemple #2
0
    def __init__(self):
        """Register the object with a Pyro4 daemon and reset request state.

        The daemon's request loop is started on a separate thread once every
        attribute has been initialised.
        """
        daemon = Pyro4.Daemon(host=others.get_ip())
        self.client_uri = daemon.register(self)
        print(self.client_uri)

        self.parser = Parser()
        # Holds the URI of the main server.
        self.main_server = None

        # State of the request currently being processed.
        self.current_request_id = 0
        self.expected_replies = 0
        self.current_request_reports = []
        self.start_time_current_request = None

        # Used when changing current_request_id and expected_replies.
        self.id_lock = threading.Lock()
        # Used when adding to / removing from current_request_reports.
        self.list_lock = threading.Lock()

        request_loop = threading.Thread(target=daemon.requestLoop)
        request_loop.start()
Exemple #3
0
def make_topic(ref, title, desc):
    """
    Add a new topic, together with its articles, to the tables.

    Every article listed on the topic page is parsed, stored as an
    ``Article`` with its word statistics, and its tags are stored through
    ``make_tags``. The topic itself is stored with statistics computed over
    the words of all its articles.

    :param ref: link to the topic
    :param title: topic name
    :param desc: topic description
    :raises IndexError: if the topic page yields no timestamps, or fewer
        links/timestamps than titles (the lists are indexed in parallel).
    """
    print('new topic')
    topic_words = []
    topic_words_len = defaultdict(int)
    topic_words_freq = defaultdict(int)
    articles = Parser(ref)
    times_articles = articles.get_time()
    a_titles, _, a_refs = articles.get_titles()
    for j, a_title in enumerate(a_titles):
        print('new article')
        article_words_len = defaultdict(int)
        article_words_freq = defaultdict(int)
        article = Parser(a_refs[j])
        # Retrieve the text once and reuse it for the statistics and for the
        # stored article instead of calling get_paragraphs() a second time.
        article_text = article.get_paragraphs()
        article_words = article_text.split()
        # Gathering the words directly gives the same word sequence as
        # joining all texts with spaces and splitting the result again.
        topic_words.extend(article_words)
        fill_words(article_words, article_words_freq, article_words_len)
        new_article = Article(topic=title, name=a_title,
                              href=a_refs[j],
                              text=article_text,
                              upd=dateparser.parse(times_articles[j].text),
                              stat_words_len=json.dumps(article_words_len),
                              stat_words_freq=json.dumps(article_words_freq))
        new_article.save()
        make_tags(article.get_tags(), a_title)
    fill_words(topic_words, topic_words_freq, topic_words_len)
    # The topic's update time is taken from the first listed timestamp.
    new_topic = Topic(name=title, description=desc, href=ref,
                      upd=dateparser.parse(times_articles[0].text),
                      stat_words_len=json.dumps(topic_words_len),
                      stat_words_freq=json.dumps(topic_words_freq))
    new_topic.save()
Exemple #4
0
@author: xuzairong
"""
from urlsMan import StockListUrlMan
from config import stockListUrl, stockUrl
from downLoader import Downloader
from myParser import Parser
from mongoDbMan import MongoMan

if __name__ == "__main__":
    #url管理器
    urlsManObject = StockListUrlMan()
    stockListUrls = urlsManObject.getStockListUrl(stockListUrl)
    #下载器
    downloaderObject = Downloader()
    #解析器
    parserObject = Parser()
    #mongo管理器
    mongoManObject = MongoMan()
    #结果
    result = []
    count = 0
    for url in stockListUrls:
        print(url)
        try:
            driver = downloaderObject.getStockInfo(url)
            json = parserObject.parseCoreData(driver)
            json["url"] = url
            result.append(json)
            count = count + 1
            print(count)
            if count % 10 == 0:
Exemple #5
0
        all_topic_text += ' ' + article.text
    fill_words(all_topic_text.split(), topic_words_freq, topic_words_len)
    return json.dumps(topic_words_len), json.dumps(topic_words_freq)


while True:
    try:
        db.close()
        db.connect()
        for index in range(len(titles)):
            if len(Topic.select().where(Topic.name == titles[index])) == 0:
                make_topic(refs[index], titles[index], description[index])
            else:
                cur_topic = Topic.get(Topic.name == titles[index])
                last_upd = cur_topic.upd
                articles = Parser(refs[index])
                times_articles = articles.get_time()
                cur_topic.upd = dateparser.parse(times_articles[0].text)
                cur_topic.save()
                a_titles, a_description, a_refs = articles.get_titles()
                have_new = False
                for j in range(len(times_articles)):
                    if dateparser.parse(times_articles[j].text) > last_upd:
                        have_new = True
                        print('new article')
                        article = Parser(a_refs[j])
                        article_words_len = defaultdict(int)
                        article_words_freq = defaultdict(int)
                        all_article_text = article.get_paragraphs()
                        fill_words(all_article_text.split(), article_words_freq, article_words_len)
                        new_article = Article(topic=titles[index], name=a_titles[j],
Exemple #6
0
class Client:
    """Interactive Pyro4 client.

    Reads commands from stdin, sends them to the main server and prints the
    reports that are delivered back through :meth:`report`. ``cp`` and
    ``get`` transfers run on their own threads and move files in parts of
    ``FILE_PART_SIZE`` bytes.
    """

    UPDATE_SERVERS_TIMEOUT = 2  # socket timeout (s) while waiting for a broadcast answer
    UPDATE_SERVERS_TIME = 2  # pause (s) between two discovery attempts
    WAIT_ANSWERS_TIME = 60  # how long (s) a request waits for its replies
    WAIT_AND_TRY_AGAIN_TIME = 5  # pause (s) before retrying a failed transfer step
    MAX_ATTEMPT_NUMBER = 2
    FILE_PART_SIZE = 10000000  # 10mb
    REPLY_POLL_INTERVAL = 0.05  # pause (s) between checks for outstanding replies

    DOWNLOAD_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'downloads')

    def __init__(self):
        """Register with a Pyro4 daemon and start its request loop thread."""
        daemon = Pyro4.Daemon(host=others.get_ip())

        self.client_uri = daemon.register(self)
        print(self.client_uri)

        self.parser = Parser()
        self.main_server = None  # holds the URI of the main server

        self.current_request_id = 0

        self.start_time_current_request = None
        self.current_request_reports = []
        self.expected_replies = 0
        # Used when changing current_request_id and expected_replies.
        self.id_lock = threading.Lock()
        # Used when adding to / removing from current_request_reports.
        self.list_lock = threading.Lock()

        threading.Thread(target=daemon.requestLoop).start()

    def call_exec_cmd(self, request_id, server, command, params):
        """Invoke ``exec_cmd`` on the server reachable at URI ``server``."""
        server = Pyro4.Proxy(server)
        server.exec_cmd(request_id, command.value, params, self.client_uri)

    def create_filename(self, tags, filename):
        """Return every tag followed by ``_``, then ``()`` and ``filename``."""
        return ''.join(t + '_' for t in tags) + '()' + filename

    def start_client(self):
        """Run the interactive loop: read a line, execute it, show the result.

        The loop ends when stdin is closed. Errors raised while handling a
        line are discarded so that one failing command does not end the
        session; ``KeyboardInterrupt`` is not caught and stops the loop.
        """
        self.update_main_server()

        while True:
            try:
                s = input()
            except EOFError:
                # stdin is exhausted; retrying would spin forever.
                break

            try:
                self._handle_input(s)
            except Exception:
                continue

    def _handle_input(self, s):
        """Parse one input line, send the command and print its outcome."""
        command, params = self.parser.parse(s)

        if self.main_server is None:
            print('No se encuentran servidores')
            return
        if command is None:
            print('comando invalido')
            return

        if command == command.cp:
            # cp must check that the file exists and send its size along.
            path, tags, filename = params
            if not os.path.exists(path):
                print(Strings.FILE_NOT_FOUND.format(path))
                return
            params = (path, tags, filename, os.stat(path).st_size)

        deadline = self._send_request(command, params)
        self._wait_for_replies(deadline)

        # Request id 0 makes report() ignore answers that arrive late.
        with self.id_lock:
            self.current_request_id = 0
            self.expected_replies = 0

        self._show_reports(command, params)

    def _send_request(self, command, params):
        """Send the command to the main server; return the reply deadline."""
        with self.id_lock:
            self.current_request_id = self.current_request_id + 1

        with self.list_lock:
            self.current_request_reports = []

        deadline = time.time() + self.WAIT_ANSWERS_TIME
        with self.id_lock:
            self.expected_replies = 0

        attempt = 1
        while attempt <= self.MAX_ATTEMPT_NUMBER:
            try:
                with self.id_lock:
                    self.expected_replies += 1
                self.call_exec_cmd(self.current_request_id, self.main_server,
                                   command, params)
                break
            except Exception:
                with self.id_lock:
                    self.expected_replies -= 1
                print(
                    Strings.UNREACHEABLE_SERVER_ERROR.format(
                        self.main_server))
                # Look for a main server again before the next attempt.
                self.main_server = None
                self.update_main_server()
                attempt += 1

        return deadline

    def _wait_for_replies(self, deadline):
        """Block until no reply is outstanding or ``deadline`` has passed."""
        while self.expected_replies > 0 and time.time() <= deadline:
            # Sleep between checks instead of spinning on the CPU.
            time.sleep(self.REPLY_POLL_INTERVAL)

    def _show_reports(self, command, params):
        """Print the collected reports, or start the transfer for cp/get."""
        reports = self.current_request_reports

        if command == command.ls:
            if len(reports) > 0:
                print('Archivos hallados')
                for f in reports:
                    print(f)
            else:
                print(
                    'No se encontraron archivos con las caracteristicas definidas'
                )
            print()

        if command == command.info:
            if len(reports) > 0:
                if reports[0] is not None:
                    for f in reports:
                        print(f)
                else:
                    print('No se encontro el archivo solicitado')
            print()

        if command == command.rm:
            if len(reports) > 0 and reports[0] is not None:
                print('Archivos eliminados')
                for t in reports:
                    print(t)
            else:
                print('No se encontraron archivos para eliminar')
            print()

        if command == command.get:
            if len(reports) > 0:
                if reports[0] is not None:
                    # Somebody holding the file was found.
                    tags, filename = params
                    servers = list(reports)
                    threading.Thread(target=self.client_get,
                                     args=(tags, filename, servers)).start()
            else:
                print('No se encontro el archivo {0}'.format(params))

        if command == command.cp:
            path, tags, filename, _size = params
            if len(reports) > 0:
                if reports[0] is not None:
                    # The main server found a server to receive the file.
                    threading.Thread(target=self.client_cp,
                                     args=(reports[0], path, tags,
                                           filename)).start()
                else:
                    print('El archivo {0} ya existe'.format(path))

    def client_cp(self, server_uri, path, tags, filename, offset=0, attempt=0):
        """Send the file at ``path`` to ``server_uri`` part by part.

        Parts of ``FILE_PART_SIZE`` bytes are read starting at ``offset``
        and passed to the server's ``fill_file``; an empty read marks the
        end of the file and is signalled with offset ``-1``. A failed send
        is retried after ``WAIT_AND_TRY_AGAIN_TIME`` seconds until
        ``MAX_ATTEMPT_NUMBER`` failures have occurred. If the file
        disappears or cannot be read the transfer is abandoned.
        """
        completed = False
        while attempt < self.MAX_ATTEMPT_NUMBER:
            if not os.path.exists(path):
                # Stop here: looping again could never succeed.
                print(Strings.FILE_LOST_SUDDENLY.format(path))
                break

            try:
                # Read-only access is enough to send the file.
                with open(path, 'rb') as fd:
                    fd.seek(offset)
                    content = fd.read(self.FILE_PART_SIZE)
            except (OSError, ValueError):
                print(Strings.FILE_LOST_SUDDENLY.format(path))
                break

            if len(content) > 0:
                try:
                    server = Pyro4.Proxy(server_uri)
                    print('Enviando offset{0}, tags:{1}, filename{2}'.format(
                        offset, tags, filename))
                    server.fill_file(tags, filename, content, offset)
                    offset += len(content)
                except Exception:
                    time.sleep(self.WAIT_AND_TRY_AGAIN_TIME)
                    print(Strings.TRYING_AGAIN.format('copiar', path))
                    attempt += 1
                continue

            # Empty read: the whole file has been sent.
            try:
                server = Pyro4.Proxy(server_uri)
                server.fill_file(tags, filename, content, -1)
                print('Se termino de enviar tags:{0}, filename{1}'.format(
                    tags, filename))
            except Exception:
                # NOTE(review): a failed end-of-file notification is ignored
                # and the copy is still reported as successful - confirm
                # that this best-effort behaviour is intended.
                pass
            print(Strings.SUCCESFUL_OPERATION.format('copiar', path))
            completed = True
            break

        if not completed:
            print(Strings.OPERATION_FAIL.format('copiar ', path))

    def client_get(self, tags, filename, servers):
        """Download ``filename`` into ``DOWNLOAD_PATH`` from ``servers``.

        The target file must not exist yet. Parts are requested through
        ``get_part`` from the current server; after repeated failures the
        next server in ``servers`` is tried. An empty part marks the end of
        the file.
        """
        attempt = -1
        completed = False
        offset = 0

        # Create the file in the predefined folder.
        path = os.path.join(self.DOWNLOAD_PATH, filename)
        try:
            # Mode 'x' refuses to open a file that already exists.
            with open(path, 'x'):
                pass
        except FileExistsError:
            print(Strings.FILE_ALREADY_EXISTS.format(path))
            return
        except OSError:
            print(Strings.FILE_LOST_SUDDENLY.format(path))
            print(Strings.OPERATION_FAIL.format('get ', (tags, filename)))
            return
        print('Creando archivo tags:{0}, filename:{1}'.format(tags, filename))

        i = 0
        # Start downloading the file.
        while i != len(servers):
            if not os.path.exists(path):
                print(Strings.FILE_LOST_SUDDENLY.format(path))
                break

            try:
                server = Pyro4.Proxy(servers[i])
                correct, content = server.get_part(tags, filename, offset,
                                                   self.FILE_PART_SIZE)
                print('Recibiendo offset:{0}, tags:{1}, filename:{2}'.format(
                    offset, tags, filename))
                if not correct:
                    break
                content = b64decode(content['data'])
                # offset now points past the part that was just received.
                offset += len(content)
            except Exception:
                time.sleep(self.WAIT_AND_TRY_AGAIN_TIME)
                print(Strings.TRYING_AGAIN.format('get', path))
                if attempt < self.MAX_ATTEMPT_NUMBER:
                    attempt += 1
                else:
                    # This server failed too often; move on to the next one.
                    attempt = 0
                    i += 1
                continue

            if len(content) > 0:
                try:
                    if os.stat(path).st_size <= offset:
                        # Append mode always writes at the end of the file.
                        with open(path, 'ab') as fd:
                            fd.write(content)
                except OSError:
                    print(
                        Strings.FILE_LOST_SUDDENLY.format((tags, filename)))
                    break
            else:
                print(
                    Strings.SUCCESFUL_OPERATION.format(
                        'copiar', (tags, filename)))
                completed = True
                break

        if not completed:
            print(Strings.OPERATION_FAIL.format('get ', (tags, filename)))

    def report(self, request_id, command, output):
        """Record ``output`` as an answer to the request ``request_id``.

        Answers for any request other than the current one, or arriving
        when no reply is expected, are ignored. ``command`` is the name of a
        ``Command`` member.
        """
        if request_id == self.current_request_id and self.expected_replies > 0:
            command = Command[command]

            if command in (Command.ls, Command.info, Command.rm, Command.cp,
                           Command.get):
                with self.list_lock:
                    if len(output) > 0:
                        self.current_request_reports.extend(output)

            with self.id_lock:
                self.expected_replies -= 1

    def update_main_server(self):
        """Scan repeatedly until a main server has been found."""
        while self.main_server is None:
            self.scan_loop()
            time.sleep(self.UPDATE_SERVERS_TIME)

    def scan_loop(self):
        """Broadcast one discovery request and store the answer, if any.

        ``self.main_server`` is set to the decoded answer, or to ``None``
        when nothing was received within ``UPDATE_SERVERS_TIMEOUT`` seconds.
        """
        scanner = socketutil.createBroadcastSocket()
        main_server = None
        try:
            scanner.settimeout(self.UPDATE_SERVERS_TIMEOUT)
            try:
                scanner.sendto(b'get_uri_client', ('255.255.255.255', 1212))
            except Exception:
                print('Error al hacer broadcast')

            try:
                data, _address = scanner.recvfrom(512)
                main_server = data.decode()
            except Exception:
                # No usable answer (e.g. timeout): leave main_server unset.
                pass
        finally:
            # Release the socket; a new one is created for every scan.
            scanner.close()

        self.main_server = main_server
        print('Servidor Principal:{0}'.format(self.main_server))
Exemple #7
0
def predictBayesianModel(sentenceList=None,
                         input_path="./model/train_model.npz",
                         word_index_file="./model/word_index.npz"):
    """Score every body sentence against the catchphrase.

    ``sentenceList[0]`` is the catchphrase; every following item is a body
    sentence. For each keyword of a body sentence that is present in the
    body-word index, the ``theta_bar`` entries of all catchphrase keywords
    found in the catchword index are summed and weighted by the keyword's
    count; the sentence score is the total over its keywords.

    :param sentenceList: catchphrase followed by the body sentences. When
        omitted, a built-in three-sentence example is used.
    :param input_path: ``.npz`` file whose ``arr_1`` entry is ``theta_bar``.
    :param word_index_file: ``.npz`` file whose ``arr_0`` entry is the
        catchword index and whose ``arr_1`` entry is the body-word index.
    :return: list with one score per body sentence, in input order; empty
        when ``sentenceList`` is empty.
    """
    if sentenceList is None:
        sentenceList = [
            'hello world occupation lease', 'machine learning board',
            'machine learning lease occupation'
        ]
    if len(sentenceList) == 0:
        return []

    # Close the archives as soon as the needed arrays have been read.
    with np.load(input_path) as npzfile:
        theta_bar = npzfile['arr_1']

    with np.load(word_index_file) as npzfile2:
        catchword_list = npzfile2['arr_0'].tolist()
        bodyword_list = npzfile2['arr_1'].tolist()

    scoreRecord = []
    parser = Parser()

    # --------------------------------------------------
    # Positions of the catchphrase keywords in the catchword index.
    # Keywords missing from the index are skipped.
    # --------------------------------------------------
    keywords, _ = parser.getKeywords(sentenceList[0])

    catchword_positionList = []
    for elem in keywords:
        try:
            catchword_positionList.append(catchword_list.index(elem['word']))
        except ValueError:
            continue

    # --------------------------------------------------
    # Score each body sentence (the first sentence is the catchphrase).
    # --------------------------------------------------
    for idx in range(1, len(sentenceList)):
        keywords, _ = parser.getKeywords(sentenceList[idx])

        sentence_score = 0
        for elem in keywords:
            word = elem['word']
            count = elem['count']

            word_score = 0
            if word in bodyword_list:
                for catchwordIdx in catchword_positionList:
                    # NOTE(review): the row is selected by the sentence
                    # position ``idx``, not by the word's position in
                    # bodyword_list - confirm that this is intended.
                    word_score += theta_bar[idx][catchwordIdx]

            sentence_score += word_score * count

        scoreRecord.append(sentence_score)

    return scoreRecord
Exemple #8
0
 def __init__(self):
     """Create a ``Parser`` and keep it in ``self.parser``."""
     self.parser = Parser()
Exemple #9
0
# Sources found in the target directory, preceded by the Jack OS sources.
filesToCompile = getFilesInDirectory(directory)
osFiles = [
    'Jack_Programs/jack-os/Array.jack',
    'Jack_Programs/jack-os/Keyboard.jack',
    'Jack_Programs/jack-os/Math.jack',
    'Jack_Programs/jack-os/Memory.jack',
    'Jack_Programs/jack-os/Output.jack',
    'Jack_Programs/jack-os/Screen.jack',
    'Jack_Programs/jack-os/String.jack',
    'Jack_Programs/jack-os/Sys.jack',
]
filesToCompile = osFiles + filesToCompile

# Show what is about to be compiled.
displayMessage("\n\n\nPrograms to Compile")
for file in filesToCompile:
    print(file)
print("\n\n")
time.sleep(0.3)

# Parse every file and merge its results into the global tables.
for file in filesToCompile:
    parser = Parser(file, exitOnErrors)
    symbolTables = parser.parse()

    # Last path component of the file, up to its first '.'.
    className = file.split("/")[-1].split(".")[0]

    globalClassTable = symbolTables[0]
    globalMethodTable[className] = symbolTables[1]
    assertions = assertions + symbolTables[2]
    classMethods = symbolTables[3]
    vmCode[className] = symbolTables[4]

# Post-processing once all files have been parsed.
checkAssertions(assertions, globalClassTable, globalMethodTable, classMethods)
vmCode = fixVMCode(vmCode)
printoutTables()
saveFiles()
displayMessage("\n\n\nDone")
Exemple #10
0
import numpy as np
import tensorflow as tf
import random
import sys, os
import json
import argparse
from myParser import Parser
#import parser
from datamanager import DataManager
from actor import ActorNetwork
from LSTM_critic import LSTM_CriticNetwork
# Restrict TensorFlow's Python logging to errors and set
# TF_CPP_MIN_LOG_LEVEL to '3'.
tf.logging.set_verbosity(tf.logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Parse the command line; arguments the parser does not know are ignored.
argv = sys.argv[1:]
parser = Parser().getParser()
args, _ = parser.parse_known_args(argv)
random.seed(args.seed)

# Load the data.
# NOTE: the dataset directory is hard-coded here; args.dataset is not used.
dataManager = DataManager('../AGnews')
train_data, dev_data, test_data = dataManager.getdata(
    args.grained, args.maxlenth)
word_vector = dataManager.get_wordvector(args.word_vector)

# Fast-test mode keeps only a leading slice of each split.
if args.fasttest == 1:
    train_data, dev_data, test_data = (train_data[:1000], dev_data[:200],
                                       test_data[:200])
Exemple #11
0
def main():
    """Translate ``SOURCE_FILE_NAME`` in two passes over its commands.

    The first pass enters every label (``L_COMMAND``) that is not yet known
    into the symbol table. The second pass writes one binary line for each
    ``A_COMMAND`` and ``C_COMMAND``.

    :raises SystemExit: with status 1 when no source file was given or a
        command of an unexpected type is met. The parser and output files
        are closed before the exception leaves this function.
    """
    if noCommandLineArguements():
        print("ERROR: No source file specified")
        raise SystemExit(1)

    parser = Parser(SOURCE_FILE_NAME)
    output = Output(SOURCE_FILE_NAME)
    try:
        symbolTable = SymbolTable()
        code = Code()

        # first pass
        while parser.hasMoreCommands():
            parser.advance()  # read next command from source file
            if parser.commandType() == "L_COMMAND":  # label
                symbol = parser.symbol()
                if not symbolTable.contains(symbol):
                    symbolTable.addEntry(symbol, parser.getAddress())
        parser.resetFile()  # reset file pointer to first line of source file

        # second pass
        while parser.hasMoreCommands():
            parser.advance()  # read next command from source file
            commandType = parser.commandType()
            if commandType == "A_COMMAND":  # addressing command
                if parser.isConstant():
                    output.writeToBin(parser.getConstant())
                else:
                    symbol = parser.symbol()
                    if not symbolTable.contains(symbol):
                        symbolTable.addEntry(symbol, "new")
                    output.writeToBin(symbolTable.getAddress(symbol))
            elif commandType == "C_COMMAND":  # computation command
                output.writeToBin(C_COMMAND_PREFIX +
                                  code.comp(parser.comp()) +
                                  code.dest(parser.dest()) +
                                  code.jump(parser.jump()))
            elif commandType != "L_COMMAND":  # labels emit nothing here
                print("ERROR: Unexpected command")
                raise SystemExit(1)
    finally:
        # Close both files on success and on every error path.
        try:
            parser.closeFile()
        finally:
            output.closeFile()
Exemple #12
0
from collections import defaultdict
import json
import requests
from myParser import Parser
from bd import Topic, Article, Tag, db
import dateparser
import config
# HTTP session whose redirect limit comes from the project configuration.
session = requests.Session()
session.max_redirects = config.MAX_REDIRECTS
# Parse the configured site and read the three lists returned by
# get_titles() (presumably topic titles, descriptions and links).
my_site = Parser(config.MY_SITE)
titles, description, refs = my_site.get_titles()
all_titles = set(titles)
# Connect to the database at import time.
db.connect()

def make_tags(tags, title):
    """
    Fill the tag table.

    One ``Tag`` row is saved per element of ``tags``, using the element's
    ``text`` as the name and its ``'href'`` item as the link.

    :param tags: list of tags
    :param title: the article the tags were taken from
    """
    for tag in tags:
        Tag(name=tag.text, article=title, href=tag['href']).save()


def fill_words(text, words_freq, words_len):
    """
    Заполняет данные словари для статистики словами
    :param text: слова
    :param words_freq: словарь для сохранения частот
    :param words_len: словарь для сохранения длин